## Load Data

In [84]:
def read_file(path):
    """
    Return the complete contents of the text file at `path` as one string.
    """
    with open(path, 'r') as handle:
        contents = handle.read()
    return contents

def load_data(data_path="data/SpamDetectionData.txt"):
    """
    Load the labelled samples and return them as two parallel lists.

    Args:
        data_path: path of the text file to read, one sample per line.
            Defaults to the dataset path this notebook uses.

    Returns:
        A tuple (emails, labels) of equal-length lists. emails[i] is the
        text of sample i with every "<p>" and "</p>" tag removed; labels[i]
        is 1 when the line started with "Spam" and 0 when it started with
        "Ham". Lines with any other prefix (markers, empty lines, ...) are
        skipped.
    """
    all_data = read_file(data_path)

    emails = []
    labels = []
    # each line of the file is a single sample
    for line in all_data.split('\n'):
        line = line.replace("<p>", "").replace("</p>", "")
        if line.startswith('Spam'):
            labels.append(1)
            # Skip the 4-character tag plus the one character that follows it
            # (presumably a separator such as a comma -- confirm against the
            # data file).
            emails.append(line[5:])
        elif line.startswith('Ham'):
            labels.append(0)
            # Same idea: 3-character tag plus one following character.
            emails.append(line[4:])
        # any other line is not a valid sample and is ignored

    return emails, labels
    
features, labels = load_data()

# Dataset summary: overall size, then the spam / ham class balance.
sample_total = len(labels)
print("total no. of samples: {}".format(sample_total))
spam_total = labels.count(1)
print("total no. of spam samples: {}".format(spam_total))
ham_total = labels.count(0)
print("total no. of ham samples: {}".format(ham_total))

# Show the first sample (full email text) together with its label.
first_email = features[0]
print("example feature: {}".format(first_email))
print("example label: {}".format(labels[0]))

TypeError: replace() takes at least 2 arguments (1 given)

## Preprocess Data

In [79]:
# Reload the samples (load_data must already be defined by the Load Data cell).
features, labels = load_data()

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Hold out 10% of the samples for testing; random_state=42 fixes the shuffle
# seed so the split is the same on every run.
(features_train,
 features_test,
 labels_train,
 labels_test) = train_test_split(features, labels, test_size=0.1, random_state=42)

# Report the size of each of the four splits, in the same order as before.
for size_report, subset in (
        ("no. of features train: {}", features_train),
        ("no. of labels train: {}", labels_train),
        ("no. of features test: {}", features_test),
        ("no. of labels test: {}", labels_test)):
    print(size_report.format(len(subset)))

# Text vectorization: turn the raw email strings into TF-IDF matrices.
# The vectorizer is fitted on the training emails only; the test emails are
# then transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer(stop_words='english', input='content')
features_train_transformed = vectorizer.fit_transform(features_train)
features_test_transformed = vectorizer.transform(features_test)




no. of features train: 1890
no. of labels train: 1890
no. of features test: 210
no. of labels test: 210


In [83]:

from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training are chained.
clf = MultinomialNB().fit(features_train_transformed, labels_train)

# Score the classifier on the held-out test split and print the result.
test_accuracy = clf.score(features_test_transformed, labels_test)
print(test_accuracy)
# NOTE(review): the recorded output is 1.0 on 210 test samples; a perfect
# score is worth double-checking (e.g. duplicated emails shared between the
# train and test splits) -- TODO confirm.

# Disabled alternative, kept for reference: evaluate a model loaded from
# "save.p" instead. Caution: pickle.load can execute arbitrary code, so only
# enable this for a file you trust.
#import pickle
#clf = pickle.load(open("save.p","rb"))
#print(clf.score(features_test, labels_test))
#print("predict", clf.predict(features_test[0:1])[0])
#print("actual", labels_test[0:1][0])




1.0
