# Load email contents and author label data
Chris has label 1 and Sara has label 0. The objective is to predict the author of an email given its content.

In [1]:
import pickle
import numpy as np
from collections import Counter

# Load the email texts and their author labels from disk.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# these files must come from a trusted source.
with open("data/word_data.pkl", "rb") as word_data_file:
    word_data = pickle.load(word_data_file)

with open("data/email_authors.pkl", "rb") as authors_file:
    authors = pickle.load(authors_file)

# Each email needs exactly one label; fail here with a clear message instead
# of letting a misaligned dataset surface later in the pipeline.
if len(word_data) != len(authors):
    raise ValueError(
        "word_data and authors are misaligned: "
        f"{len(word_data)} emails vs {len(authors)} labels"
    )

print('Number of emails: ', len(word_data))
print('Number of labels: ', len(authors))

# Per-author counts; these are computed over every loaded label
# (1 is compared for Chris, 0 for Sara).
authors_np = np.array(authors)
print("Number of Chris training emails: ", (authors_np == 1).sum())
print("Number of Sara training emails: ", (authors_np == 0).sum())

Number of emails:  17578
Number of labels:  17578
Number of Chris training emails:  8801
Number of Sara training emails:  8777


# Split into train-test set

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the emails for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    word_data,
    authors,
    test_size=0.1,
    random_state=42,
)

# Report how many samples ended up in each part of the split.
for caption, split_part in (
    ('Number of training feature dataset: ', X_train),
    ('Number of training target dataset: ', y_train),
    ('Number of testing feature dataset: ', X_test),
    ('Number of testing target dataset: ', y_test),
):
    print(caption, len(split_part))

Number of training feature dataset:  15820
Number of training target dataset:  15820
Number of testing feature dataset:  1758
Number of testing target dataset:  1758


# Vectorize into tfidf matrix

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer: sublinear term-frequency scaling, English stop
# words removed, and max_df=0.5 as the document-frequency cut-off.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
    stop_words='english',
)

# Fit on the training emails only, then apply the same fitted vectorizer to
# the test emails.
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

for caption, matrix in (
    ('Dimension of transformed training feature matrix: ', X_train_transformed),
    ('Dimension of transformed testing feature matrix: ', X_test_transformed),
):
    print(caption, matrix.shape)

Dimension of transformed training feature matrix:  (15820, 37851)
Dimension of transformed testing feature matrix:  (1758, 37851)


# Select most helpful features (top 10%)

In [4]:
from sklearn.feature_selection import SelectPercentile, f_classif

# Score every feature with f_classif on the training data and keep the top
# 10 percent.
selector = SelectPercentile(score_func=f_classif, percentile=10).fit(
    X_train_transformed, y_train
)

# NOTE: both names are rebound to the reduced, dense (.toarray()) matrices, so
# re-running this cell on its own would select from the already reduced data.
X_train_transformed = selector.transform(X_train_transformed).toarray()
X_test_transformed = selector.transform(X_test_transformed).toarray()

for caption, matrix in (
    ('Dimension of optimized training feature matrix: ', X_train_transformed),
    ('Dimension of optimized testing feature matrix: ', X_test_transformed),
):
    print(caption, matrix.shape)

Dimension of optimized training feature matrix:  (15820, 3785)
Dimension of optimized testing feature matrix:  (1758, 3785)


# Update train-test variables, convert them to numpy arrays

In [5]:
# Point the short split names at the selected feature matrices and turn the
# label sequences into numpy arrays.
X_train, X_test = X_train_transformed, X_test_transformed
y_train, y_test = np.array(y_train), np.array(y_test)

# Show the final shape of each dataset.
for caption, dataset in (
    ('Dimension of training feature dataset: ', X_train),
    ('Dimension of training target dataset: ', y_train),
    ('Dimension of testing feature dataset: ', X_test),
    ('Dimension of testing target dataset: ', y_test),
):
    print(caption, dataset.shape)

Dimension of training feature dataset:  (15820, 3785)
Dimension of training target dataset:  (15820,)
Dimension of testing feature dataset:  (1758, 3785)
Dimension of testing target dataset:  (1758,)


# Create naive Bayes model

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score

# Train a multinomial naive Bayes classifier with its default settings;
# fit() returns the estimator itself, so the call can be chained.
model = MultinomialNB().fit(X_train, y_train)
print('Model: ', model)

# Evaluate on the held-out test set.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print('Accuracy: ', test_accuracy)
print('Confusion matrix: ')
print(test_confusion)

Model:  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy:  0.982935153584
Confusion matrix: 
[[883  10]
 [ 20 845]]
