In [9]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [10]:
def make_vocabulary(root_dir):
  word_list = []  #create a list to contain all the words
  emails = [os.path.join(root_dir,f) for f in os.listdir(root_dir)] # os.listdir can output a list of emails names that folders contain
  for mail in emails:  
    with open(mail) as m:  #with - as - means assigns open(mail)'s value to m, after the end of the program, automatically exit
      for line in m:
        words = line.split() #The default separator is space. Separate the words in each line.
        word_list += words
  vocabulary = Counter(word_list)#count the number of occurence of string, and then return the value can be interpreted as a dictionary
  clean_vocabulary = list(vocabulary)#transform vocabulary as a list type

  for item in clean_vocabulary:
    if item.isalpha() == False:#The isalpha() method checks if the string consists of only letters
      del vocabulary[item]
    elif len(item) == 1:#Checks whether the length of string is 1 
      del vocabulary[item]
  vocabulary = vocabulary.most_common(3000) #Most_common () lists the n most common elements and their number
  return vocabulary

In [11]:
def get_features(mail_dir):
  files = [os.path.join(mail_dir,fi) for fi in os.listdir(mail_dir)]# os.listdir can output a list of emails names that folders contain
  features_matrix = np.zeros((len(files),3000))#Return a new array with shape is length of files,3000
  train_labels = np.zeros(len(files))#Return a new array with shape is length of files
  count = 1;
  documentN0 = 0;
  for fil in files:
    with open(fil) as fi: #with - as - means assigns open(fil)'s value to m, after the end of the program, automatically exit
      for i, line in enumerate(fi): #Enumerate() method adds a counter to an iterable and returns it in a form of enumerating object. This enumerated object can then be used directly for loops or converted into a list of tuples using the list() method
        if i ==2:
          words = line.split() #split to get words
          for word in words:
            wordID = 0  #assign wordID to word in words
            for i, d in enumerate(vocabulary):
              if d[0] == word:
                wordID = i
                features_matrix[documentN0,wordID] = words.count(word) # create word frequency matrix
      train_labels[documentN0] = 0;
      filepathTokens = fil.split('/')
      lastToken = filepathTokens[len(filepathTokens)-1]
      if lastToken.startswith("spmsg"): #the name of spam emails begins with spmsg
        train_labels[documentN0] = 1;
        count = count + 1
     documentN0 = documentN0 + 1
  return features_matrix, train_labels

In [12]:
TRAIN_DIR = 'F:/lmu data/BSAN6070/CA02/train-mails'
TEST_DIR = 'F:/lmu data/BSAN6070/CA02/test-mails'

vocabulary = make_vocabulary(TRAIN_DIR) #call the function def make_vocabulary(root_dir)

print ("reading and processing emails from TRAIN and TEST folders")
features_matrix, labels = get_features(TRAIN_DIR)  #call the function def get_features(mail_dir)
test_features_matrix, test_labels = get_features(TEST_DIR)

model = GaussianNB() #use GaussianNB

print ("Training Model using Gaussian Naibe Bayes algorithm .....")
model.fit(features_matrix, labels)#Used to perform the training process
print ("Training completed")
print ("testing trained model to predict Test Data labels")
predicted_labels = model.predict(test_features_matrix)
print ("Completed classification of the Test Data .... now printing Accuracy Score by comparing the Predicted Labels with the Test Labels:")
print (accuracy_score(test_labels, predicted_labels)) #compare the accuracy score for predicted labels. Accuracy score is just percentage of correct predictions. 

reading and processing emails from TRAIN and TEST folders
Training Model using Gaussian Naibe Bayes algorithm .....
Training completed
testing trained model to predict Test Data labels
Completed classification of the Test Data .... now printing Accuracy Score by comparing the Predicted Labels with the Test Labels:
1.0
