# Mini Project - Email classifier with Support Vector Machine

In [1]:
# Run configuration read by the cells below.
# The two data URLs point at the raw file contents: a GitHub ".../blob/..."
# URL serves the HTML viewer page for the file rather than the pickle bytes.
params = {
    "authors_file_path": "https://raw.githubusercontent.com/udacity/ud120-projects/master/tools/email_authors.pkl",
    "word_file_path": "https://raw.githubusercontent.com/udacity/ud120-projects/master/tools/word_data.pkl",
    # Fraction of the samples held out for the test set.
    "test_size": 0.1,
    # Seed handed to the train/test split.
    "random_state": 42
}

In [2]:
from marvin_python_toolbox.common.data import MarvinData
import pickle
import cPickle

### the words (features) and authors (labels), already largely preprocessed
### this preprocessing will be repeated in the text learning mini-project

print "Downloading files ...."
authors_file_path = MarvinData.download_file(params["authors_file_path"])
word_file_path = MarvinData.download_file(params["word_file_path"])

print "Loading files ...."
authors_file_handler = open(authors_file_path, "r")
authors = pickle.load(authors_file_handler)
authors_file_handler.close()

words_file_handler = open(word_file_path, "r")
word_data = cPickle.load(words_file_handler)
words_file_handler.close()

initial_dataset = {
    "word_data": word_data,
    "authors": authors
}

print "Done!"

Downloading files ....
Loading files ....
Done!


In [3]:
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

### test_size is the percentage of events assigned to the test set
### (remainder go into training)
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    initial_dataset["word_data"], 
    initial_dataset["authors"], 
    test_size=params["test_size"], 
    random_state=params["random_state"])

### text vectorization--go from strings to lists of numbers
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
features_train_transformed = vectorizer.fit_transform(features_train)
features_test_transformed  = vectorizer.transform(features_test)

### feature selection, because text is super high dimensional and 
### can be really computationally chewy as a result
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(features_train_transformed, labels_train)

features_train_transformed = selector.transform(features_train_transformed).toarray()
features_test_transformed  = selector.transform(features_test_transformed).toarray()

### info on the data
print "no. of Chris training emails:", sum(labels_train)
print "no. of Sara training emails:", len(labels_train) - sum(labels_train)

dataset = {
    "features_train_transformed": features_train_transformed,
    "features_test_transformed": features_test_transformed,
    "labels_train": labels_train,
    "labels_test": labels_test
}



no. of Chris training emails: 7936
no. of Sara training emails: 7884


In [30]:
""" 
    This is the code to accompany the Lesson 2 (SVM) mini-project. 

    Use a Support Vector Machine to identify emails by their authors
    
    authors and labels:
    Sara has label 0
    Chris has label 1
"""
from sklearn.svm import SVC
from time import time

print "Starting traning process..."
t0 = time()

clf = SVC(kernel="rbf", C=10000)

clf.fit(dataset["features_train_transformed"], dataset["labels_train"])

print "training time:", round(time()-t0, 3), "s"

model = clf

print "Done!"

Starting traning process...
training time: 137.849 s
Done!


In [31]:
from sklearn.metrics import accuracy_score
from time import time

t0 = time()
y_pred = model.predict(dataset["features_test_transformed"])
print "prediction time:", round(time()-t0, 3), "s"

accuracy_score = accuracy_score(dataset["labels_test"], y_pred)

print "the accuracy score is ", accuracy_score
metrics = {"accuracy_score": accuracy_score}

prediction time: 13.984 s
the accuracy score is  0.990898748578
