## Load Data

In [172]:
def read_file(path):
    """
    read and return all data in a file
    """
    with open(path, 'r') as f:
        return f.read()

def load_data():
    """
    load and return the data in features and labels lists
    each item in features contains the raw email text
    each item in labels identifies whether the corresponding item in features is spam (1) or ham (0)
    """
    # load all data from file
    data_path = "data/SpamDetectionData.txt"
    all_data = read_file(data_path)
    
    # split the data into lines, each line is a single sample
    all_lines = all_data.split('\n')

    # each line in the file is a sample and has the following format
    # it begins with either "Spam," or "Ham,", and follows by the actual text of the email
    # e.g. Spam,<p>His honeyed and land....
    
    # extract the feature (email text) and label (spam or ham) from each line
    features = []
    labels = []
    for line in all_lines:
        if line[0:4] == 'Spam':
            labels.append(1)
            features.append(line[5:])
            pass
        elif line[0:3] == 'Ham':
            labels.append(0)
            features.append(line[4:])
            pass
        else:
            # ignore markers, empty lines and other lines that aren't valid sample
            # print('ignore: "{}"'.format(line));
            pass
    
    return features, labels
    
features, labels = load_data()

print("total no. of samples: {}".format(len(labels)))
print("total no. of spam samples: {}".format(labels.count(1)))
print("total no. of ham samples: {}".format(labels.count(0)))

print("\n")
print("example feature: {}".format(features[0][0:]))
print("example label: {} ({})".format(labels[0], 'spam' if labels[0] else 'ham'))

total no. of samples: 2100
total no. of spam samples: 1043
total no. of ham samples: 1057


example feature: <p>But could then once pomp to nor that glee glorious of deigned. The vexed times childe none native. To he vast now in to sore nor flow and most fabled. The few tis to loved vexed and all yet yea childe. Fulness consecrate of it before his a a a that.</p><p>Mirthful and and pangs wrong. Objects isle with partings ancient made was are. Childe and gild of all had to and ofttimes made soon from to long youth way condole sore.</p>
example label: 1 (spam)


## Preprocess Data

In [176]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# load features and labels
features, labels = load_data()

# split data into training / test sets
features_train, features_test, labels_train, labels_test = train_test_split(
    features, 
    labels, 
    test_size=0.1, # use 10% for testing
    random_state=42)

print("no. of features train: {}".format(len(features_train)))
print("no. of labels train: {}".format(len(labels_train)))
print("no. of features test: {}".format(len(features_test)))
print("no. of labels test: {}".format(len(labels_test)))

# vectorize email text into tfidf matrix
# TfidfVectorizer converts collection of raw documents to a matrix of TF-IDF features.
# It's equivalent to CountVectorizer followed by TfidfTransformer.
vectorizer = TfidfVectorizer(
    input='content',     # input is actual text
    lowercase=True,      # convert to lower case before tokenizing
    stop_words='english' # remove stop words
)
features_train_transformed = vectorizer.fit_transform(features_train)
features_test_transformed  = vectorizer.transform(features_test)


no. of features train: 1890
no. of labels train: 1890
no. of features test: 210
no. of labels test: 210


In [203]:
from sklearn.naive_bayes import MultinomialNB
import pickle

def save(vectorizer, clf):
    '''
    save classifier from disk
    '''
    with open('clf.pkl', 'wb') as file:
        pickle.dump((vectorizer, clf), file)
        
def load():
    '''
    load classifier from disk
    '''
    with open('clf.pkl', 'rb') as file:
      vectorizer, clf = pickle.load(file)
    return vectorizer, clf

# train a classifier
classifier = MultinomialNB()
classifier.fit(features_train_transformed, labels_train)

# save classifier
save(vectorizer, classifier)

# score the classifier accuracy
print("classifier score {.3f}".format(classifier.score(features_test_transformed, labels_test) * 100))



AttributeError: 'numpy.float64' object has no attribute '3f'