## Machine learning project of the Udacity course "Intro to Machine Learning"
The course is [here](https://www.udacity.com/course/intro-to-machine-learning--ud120). The project aims to recognize patterns in the Enron email dataset.

In [1]:
## packages import
import six.moves.cPickle as pickle
import os
import re
import sys
import numpy as np
from time import time

### email author recognition
The first interesting task using the Enron emails is a classification problem -- identifying who sent each email (here, either Sara or Chris).

In [2]:
import string

def parseOutText(f):
    """ 
    Given an opened email file f, take everything that follows the first
    "X-FileName:" marker, strip punctuation, stem the words, and
    return a string that contains all the stemmed words.

    Each stemmed word is preceded by a single space, so a non-empty result
    starts with a space. If the marker is not found, an empty string is
    returned.

    Example use case:
    f = open("email_file_name.txt", "r")
    text = parseOutText(f)

    """
    f.seek(0)  # go back to beginning of file
    all_text = f.read()

    # split off metadata: only the piece right after the first marker is used
    content = all_text.split("X-FileName:")
    if len(content) <= 1:
        return ""

    # remove punctuation; a regex character class behaves the same on
    # Python 2 and 3, unlike string.maketrans / two-argument str.translate
    punctuation_pattern = "[" + re.escape(string.punctuation) + "]"
    text_string = re.sub(punctuation_pattern, "", content[1])

    # imported lazily so that nltk is only needed when there is text to stem
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")

    # build the result in one pass instead of repeated string concatenation
    return "".join(" " + stemmer.stem(word) for word in text_string.split())

In [3]:
word_file = './text_learning/your_word_data.pkl'
author_file = './text_learning/your_email_authors.pkl'

if not os.path.exists(word_file) or not os.path.exists(author_file):
    ## Load and process email data
    from_sara  = open("./text_learning/from_sara.txt", "r")
    from_chris = open("./text_learning/from_chris.txt", "r")

    authors = []  # to make labels from the email author
    word_data = []  # email content (stemmed words) as the data

    for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
        for path in from_person:
            path = os.path.join('.', path[:-1])
            print path
            email = open(path, "r")

            # use parseOutText to extract the text from the opened email
            words = parseOutText(email)
            
            # sig is the list of signatures that can recognize the mail's author (machine learning thus useless)
            # so we drop them
            # sig can be extracted from an overfitted model by seeing the feature importances
            # and overfitted model can be implemented from a small data set
            sig = ["sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf"]
            for s in sig:
                words = words.replace(s, "")
            
            # append the text to word_data
            word_data.append(words)

            # append a 0 to authors if email is from Sara, and 1 if email is from Chris
            if name is "sara":
                authors.append(0)
            else:
                authors.append(1)

            email.close()

    from_sara.close()
    from_chris.close()

    # save the processed data
    pickle.dump( word_data, open("./text_learning/your_word_data.pkl", "w") )
    pickle.dump( authors, open("./text_learning/your_email_authors.pkl", "w") )
    print "emails processed and saved."


with open("./text_learning/your_word_data.pkl") as f:
    word_data = pickle.load(f)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english")
vectorizer.fit(word_data)
features = vectorizer.get_feature_names()
print "Num of samples in word_data:", len(word_data) # 17578 examples
print "Num of different words:", len(features) # 38755 features

Num of samples in word_data: 17578
Num of different words: 38755


In [4]:
## Reload the data to train ML models
words_file = "./text_learning/your_word_data.pkl" 
authors_file = "./text_learning/your_email_authors.pkl"
word_data = pickle.load( open(words_file, "r") )
authors = pickle.load( open(authors_file, "r") )

from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(word_data, authors, \
    test_size=0.1, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.1, min_df=0.01, stop_words='english')
trainset = vectorizer.fit_transform(features_train).toarray()
testset  = vectorizer.transform(features_test).toarray()

print "training set size:", trainset.shape
print "test set size:", testset.shape

training set size: (15820, 876)
test set size: (1758, 876)


In [5]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

clf = GaussianNB()
clf.fit(trainset, labels_train)
print "Naive Bayes model evaluation is {0}%".format(clf.score(testset, labels_test) * 100)
clf = LogisticRegression()
clf.fit(trainset, labels_train)
print "Logistic regression model evaluation is {0}%".format(clf.score(testset, labels_test) * 100)
clf = AdaBoostClassifier()
clf.fit(trainset, labels_train)
print "Adaboost model evaluation is {0}%".format(clf.score(testset, labels_test) * 100)
clf = RandomForestClassifier()
clf.fit(trainset, labels_train)
print "Random forest model evaluation is {0}%".format(clf.score(testset, labels_test) * 100)

Naive Bayes model evaluation is 95.6769055745%
Logistic regression model evaluation is 97.1558589306%
Adaboost model evaluation is 92.0932878271%
Random forest model evaluation is 98.6348122867%


### Recognize the person of interest (POI) in the Enron scandal
The main project is to identify Enron employees who may have committed fraud based on the public Enron financial and email dataset.

In [6]:
def computeFraction( poi_messages, all_messages ):
    """ Given a number of messages to/from POI (numerator)
        and number of all messages to/from a person (denominator),
        return the fraction of messages to/from that person
        that are from/to a POI.

        Both arguments are converted with float(), so the string "NaN"
        is accepted. The result is 0.0 when either value is NaN or when
        the denominator is zero (no messages at all).
    """
    import math
    poi_messages, all_messages = float(poi_messages), float(all_messages)

    # missing data, or nothing to divide by: report "no POI contact"
    if math.isnan(poi_messages) or math.isnan(all_messages) or all_messages == 0:
        return 0.0

    return poi_messages / all_messages


def addFeature(data_dict):
    """Attach 'fraction_from_poi' and 'fraction_to_poi' to every entry.

       Each value of data_dict is itself a dict; the two new keys are
       written into it in place, computed with computeFraction from the
       existing message counts. The same data_dict object is returned.
    """
    # (new feature, count involving a POI, total count)
    derived_features = (
        ("fraction_from_poi", "from_poi_to_this_person", "to_messages"),
        ("fraction_to_poi", "from_this_person_to_poi", "from_messages"),
    )

    for person in data_dict:
        record = data_dict[person]
        for new_key, poi_key, total_key in derived_features:
            record[new_key] = computeFraction( record[poi_key], record[total_key] )

    return data_dict


def featureFormat(dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False):
    """ Convert dictionary to numpy array of features
        remove_NaN = True will convert "NaN" string to 0.0
        remove_all_zeroes = True will omit any data points for which
            all the features you seek are 0.0
        remove_any_zeroes = True will omit any data points for which
            any of the features you seek are 0.0
        sort_keys = True sorts keys by alphabetical order. Setting the value as
            a string opens the corresponding pickle file with a preset key
            order (this is used for Python 3 compatibility, and sort_keys
            should be left as False for the course mini-projects).
        NOTE: first feature is assumed to be 'poi' and is not checked for
            removal for zero or missing values.

        Returns a numpy array with one row per kept data point and one
        column per entry of features. If a requested feature is missing,
        an error line is printed and None is returned.
    """
    return_list = []

    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        # close the key file as soon as the preset order has been read
        with open(sort_keys, "rb") as key_file:
            keys = pickle.load(key_file)
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                # single-argument form prints the same text on Python 2 and 3
                print("error: key  {0}  not present".format(feature))
                return
            if value=="NaN" and remove_NaN:
                value = 0
            tmp_list.append( float(value) )

        # Logic for deciding whether or not to add the data point.
        # exclude 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list

        # test_list holds floats only, so comparing against 0 is sufficient
        append = True
        # drop data points whose checked features are all zero (default True)
        if remove_all_zeroes:
            append = any(item != 0 for item in test_list)
        # drop data points with at least one zero feature (default False)
        if remove_any_zeroes and 0 in test_list:
            append = False

        # Append the data point if flagged for addition.
        if append:
            return_list.append( np.array(tmp_list) )

    return np.array(return_list)


def targetFeatureSplit( data ):
    """ Split rows such as those produced by featureFormat into
        targets and features.

        The first element of each row becomes its target and the
        remaining elements become its feature vector. Two lists of
        equal length are returned: (target, features).
    """
    # one pass over data, pairing each row's head with its tail
    split_rows = [(row[0], row[1:]) for row in data]

    target = [head for head, _ in split_rows]
    features = [tail for _, tail in split_rows]

    return target, features

In [7]:
# Select the features of the dataset to be trained
features_list = ['poi','fraction_to_poi','fraction_from_poi','shared_receipt_with_poi',
                 'bonus','total_stock_value']

with open("./final_project/final_project_dataset.pkl", "r") as f:
    my_dataset = pickle.load(f)

my_dataset.pop('TOTAL')  # remove outliers
my_dataset = addFeature(my_dataset)  # add my customed new features

from sklearn.preprocessing import MinMaxScaler
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
scaler = MinMaxScaler()
scaler.fit(features)
features = scaler.transform(features)
print "Num of features:", len(features_list)-1
print "Num of data points:", len(labels)

Num of features: 5
Num of data points: 131


#### Model training
Now let's start training our models. Because the dataset is small, it's fast and handy to try several sklearn learning algorithms. And grid search cross-validation is used here to select the best hyper-parameters of each algorithm.

In [8]:
# sklearn.cross_validation and sklearn.grid_search were deprecated in favour
# of sklearn.model_selection; try the new location first and fall back for
# old installations
try:
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV

# hold out 20% of the data points for testing, with a fixed seed
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.2, random_state=45)

In [9]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
params = {'C':[.01, .1, 1, 10, 100]}
cvModel = GridSearchCV(clf, param_grid=params, cv=5)
cvModel.fit(features_train, labels_train)
print "The best hyper-parameters:\n  ", cvModel.best_params_
print "The test accuracy is", cvModel.score(features_test, labels_test)

The best hyper-parameters:
{'C': 0.01}
The test accuracy is 0.925925925926


In [10]:
from sklearn.svm import SVC

clf = SVC()
params = {'kernel':('linear', 'rbf'), 'C':[.01, .1, 1, 10, 100]}
cvModel = GridSearchCV(clf, param_grid=params, cv=5)
cvModel.fit(features_train, labels_train)
print "The best hyper-parameters:\n  ", cvModel.best_params_
print "The test accuracy is", cvModel.score(features_test, labels_test)

The best hyper-parameters:
{'kernel': 'linear', 'C': 0.01}
The test accuracy is 0.925925925926


In [11]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()
params = {'n_neighbors':[3,5,7], 'weights': ('uniform', 'distance')}
cvModel = GridSearchCV(clf, param_grid=params, cv=5)
cvModel.fit(features_train, labels_train)
print "The best hyper-parameters:\n  ", cvModel.best_params_
print "The test accuracy is", cvModel.score(features_test, labels_test)

The best hyper-parameters:
{'n_neighbors': 5, 'weights': 'uniform'}
The test accuracy is 0.851851851852


In [12]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
params = {'max_features':[3,4,5], 'max_depth':[3,5,7]}
cvModel = GridSearchCV(clf, param_grid=params, cv=5)
cvModel.fit(features_train, labels_train)
print "The best hyper-parameters:\n  ", cvModel.best_params_
print "The test accuracy is", cvModel.score(features_test, labels_test)

The best hyper-parameters:
{'max_features': 4, 'max_depth': 5}
The test accuracy is 0.814814814815


In [13]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=45)
params = {'n_estimators':[5, 10, 15], 'max_features':[2,4], 'max_depth':[None, 3, 5]}
cvModel = GridSearchCV(clf, param_grid=params, cv=5)
cvModel.fit(features_train, labels_train)
print "The best hyper-parameters:\n  ", cvModel.best_params_
print "The test accuracy is", cvModel.score(features_test, labels_test)

The best hyper-parameters:
{'max_features': 2, 'n_estimators': 15, 'max_depth': 3}
The test accuracy is 0.851851851852


#### Average over different splits
To see how much the random split into training and test sets affects the result, we average the test accuracy over several different splits.

In [14]:
state = [23, 35, 42, 51, 68, 99]

accu_list = []
for s in state:
    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.2, random_state=s)
    clf = SVC()
    params = {'kernel':('linear', 'rbf'), 'C':[.01, .1, 1, 10, 100]}
    cvModel = GridSearchCV(clf, param_grid=params, cv=5)
    cvModel.fit(features_train, labels_train)
    accu_list.append(cvModel.score(features_test, labels_test))

print "The avg of test accuracy by SVM is", np.mean(accu_list)

The avg of test accuracy by SVM is 0.901234567901


In [15]:
accu_list = []
for s in state:
    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.2, random_state=s)
    clf = KNeighborsClassifier()
    params = {'n_neighbors':[3,5,7], 'weights': ('uniform', 'distance')}
    cvModel = GridSearchCV(clf, param_grid=params, cv=5)
    cvModel.fit(features_train, labels_train)
    accu_list.append(cvModel.score(features_test, labels_test))

print "The avg of test accuracy by kNN is", np.mean(accu_list)

The avg of test accuracy by kNN is 0.851851851852
