In [13]:
import os, sys
import pandas as pd
import numpy as np
import re
import nltk

from nltk.probability import *
from nltk import NaiveBayesClassifier
from nltk.tokenize import wordpunct_tokenize
from collections import defaultdict
#------------------------------------get files from directory------------------------------------------------------------
def getFilesFromDir (path):
    dir_content = os.listdir(path)
    dir_clean = filter(lambda x: (".DS_Store" not in x) and ("cmds" not in x), dir_content)
    msg = map(lambda x: getMessage(path + '/' + x), dir_clean)
    return msg
#------------------------------------get message from files--------------------------------------------------------------
def getMessage (path):
    fo = open (path)
    lines = fo.readlines()
    fo.close()
#get message body
    lines = ''.join(lines)
    body_position_index = lines.index('\n\n')
    lines = lines[body_position_index + 1: len(lines)]
    return lines
#--------------------------parameter setting-----------------------------------------------------------------------------
#set amount of samples to take, 500 samples equals to the number of spam samples
amountOfSamplesPerSet = 500
amountOfFeaturesPerSet = 400#features can be 50, 100, 200, 400...
#file path
base_path = "/Users/yuliang/Downloads/naive_bayes"
ham_path = base_path + "/easy_ham"
ham_path2 = base_path + "/easy_ham_2"
spam_path = base_path + "/spam"
spam_path2 = base_path + "/spam_2"
#get files from directory
hamTrainDir = getFilesFromDir(ham_path)
hamTrainDir = hamTrainDir[:amountOfSamplesPerSet]
hamTestDir = getFilesFromDir(ham_path2)
spamTrainDir = getFilesFromDir(spam_path)
spamTrainDir = spamTrainDir[:amountOfSamplesPerSet]
spamTestDir = getFilesFromDir(spam_path2)
#-------------------------get message words from data--------------------------------------------------------------------
def getMessageWords(file_msg, stopwords = []):
    file_msg = ''.join(file_msg)
    file_msg = re.sub('3D', '', file_msg)
    file_msg = re.sub(r'([^\s\w]|_)+', '', file_msg)
    
    file_msg_words = wordpunct_tokenize(file_msg.replace('=\n', '').lower())
    file_msg_words = filter(lambda x: x not in stopwords, file_msg_words)
    file_msg_words = [w for w in file_msg_words if re.search('[a-zA-Z]', w) and len(w) > 1]
    return file_msg_words

def getMessageWords2(file_msg, stopwords = []):
    file_msg = re.sub('3D', '', file_msg)
    file_msg = re.sub(r'([^\s\w]|_)+', '', file_msg)
    
    file_msg_words = wordpunct_tokenize(file_msg.replace('=\n', '').lower())
    file_msg_words = filter(lambda x: x not in stopwords, file_msg_words)
    file_msg_words = [w for w in file_msg_words if re.search('[a-zA-Z]', w) and len(w) > 1]
    return file_msg_words
#-------------------------get stop words---------------------------------------------------------------------------------
def getStopWords (path):
    fo = open (path)
    lines = fo.readlines()
    lines_clean = map(lambda x: str.replace(x, '\n', ''), lines)
    fo.close()
    return lines_clean
stop_words_given = getStopWords ("/Users/yuliang/Downloads/naive_bayes/stopwords.txt")
#-----------------------------------Term Document Matrix-----------------------------------------------------------------
#get features
def getFeatures(file_msg, **kwargs):
    file_msg_words = getMessageWords(file_msg, **kwargs)
    words_list = nltk.FreqDist(file_msg_words)
    words_list_common = words_list.most_common()
    topFeatures = map(lambda x: x[0], words_list_common[:amountOfFeaturesPerSet])
    return topFeatures

#filter stop words
hamFeatures = getFeatures(hamTrainDir, stopwords = stop_words_given)
spamFeatures = getFeatures(spamTrainDir, stopwords = stop_words_given)

#combine features
allFeatures = set(hamFeatures + spamFeatures)
allFeatures = list(allFeatures)

#feature label
def getFeaturesLabel(file_msg, label, allFeature, feature_extractor, **kwargs):
    features = map(lambda x: feature_extractor(x, allFeature, **kwargs), file_msg)
    features_label = map(lambda x: (x, label), features)
    return features_label

#words indicator
def wordsIndicator(file_msg, allFeature, **kwargs):
    file_msg_words = getMessageWords2(file_msg, **kwargs)
    featureWords = allFeature
    features_dict = defaultdict(list) 
    for w in file_msg_words:
        if w in allFeature:
            features_dict[w] = True
    return features_dict
#------------------------------------Train and test dataset--------------------------------------------------------------
hamTrainFeature = getFeaturesLabel(hamTrainDir, 'ham', allFeatures, wordsIndicator, stopwords = stop_words_given)
spamTrainFeature = getFeaturesLabel(spamTrainDir, 'spam', allFeatures, wordsIndicator, stopwords = stop_words_given)
trainFeature = hamTrainFeature + spamTrainFeature
hamTestFeature = getFeaturesLabel(hamTestDir, 'ham', allFeatures, wordsIndicator, stopwords = stop_words_given)
spamTestFeature = getFeaturesLabel(spamTestDir, 'spam', allFeatures, wordsIndicator, stopwords = stop_words_given)
#------------------------------------Naive Bayes Classifier--------------------------------------------------------------
# Train the naive bayes classifier
naive_bayes_classifier = NaiveBayesClassifier.train(trainFeature)
print ('Test Spam accuracy: %.4f' %nltk.classify.accuracy(naive_bayes_classifier, spamTestFeature))
print ('Test Ham accuracy: %.4f' %nltk.classify.accuracy(naive_bayes_classifier, hamTestFeature))


Test Spam accuracy: 0.9907
Test Ham accuracy: 0.9250
