# Saving the email features created with TfIdf to a dict that can be merged with the financial features

Load tuple list created in [vectorize_email_features](vectorize_email_features.ipynb) as `label_email_text` and the original project data as `data_dict`

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle

# Location of the pickles written by the earlier notebooks.
HOME_PATH = os.path.expanduser('~')
DATA_PATH = os.path.join(HOME_PATH, 'Desktop', 'raw_data', 'ml')
label_email_text_path = os.path.join(DATA_PATH, 'label_email_text.pkl')


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


label_email_text = _load_pickle(label_email_text_path)
data_dict = _load_pickle('data/final_project_dataset.pkl')

# Drop the two records that are excluded from the analysis
# (raises KeyError if either key is missing, exactly like a bare `del`).
for dropped_key in ('TOTAL', 'LOCKHART EUGENE E'):
    del data_dict[dropped_key]

Separate emails into to, from, or all categories

In [3]:
# Transpose the list of 5-field tuples into five parallel tuples, one per field.
labels, to_text, from_text, all_text, emails = zip(*label_email_text)

# All four text/e-mail columns should have the same length.
column_sizes = (len(to_text), len(from_text), len(all_text), len(emails))
print(column_sizes)

(90, 90, 90, 90)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

# TF-IDF over the "from" e-mail text of every person.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    sublinear_tf=True,
)
from_array = vectorizer.fit_transform(from_text)

# Keep the top 10% of the TF-IDF columns as ranked by the ANOVA F-score
# against the POI labels.
selector = SelectPercentile(score_func=f_classif, percentile=10)
selector.fit(from_array, labels)
sel_from_array = selector.transform(from_array)

# Column indices (into from_array) of the features that were kept.
sel_idxs = selector.get_support(indices=True)

Create a list with the selected words used as keys in the vectorizer

In [9]:
# word -> column index mapping learned by the vectorizer
vocab = vectorizer.vocabulary_

print('Vocab dictionary before selection:  {} compare to:  {}'.format(
    len(vocab), from_array.shape[1]))

# Membership tests against a set are O(1); testing `item in sel_idxs` on the
# ndarray rescans all selected indices for every vocabulary entry.
sel_idx_set = set(sel_idxs.tolist())
sel_vocab = {key: item for key, item in vocab.items() if item in sel_idx_set}

print('Vocab dictionary AFTER selection:  {} compare to:  {}'.format(
    len(sel_vocab), sel_from_array.shape[1]))

# Order the words by their column index so that position i in `keys` is the
# word for column i of from_array, then keep only the selected columns.
vocab_list = [[key, idx] for key, idx in vocab.items()]
keys, idxs = zip(*vocab_list)
keys, idxs = np.array(keys), np.array(idxs)
keys = keys[np.argsort(idxs)]
sel_vocab_list = keys[selector.get_support()]

print('Length of list with selected words: {} compare with: {}'.format(
    len(sel_vocab_list), sel_from_array.shape[1]))

Vocab dictionary before selection:  42115 compare to:  42115
Vocab dictionary AFTER selection:  4212 compare to:  4212
Length of list with selected words: 4212 compare with: 4212


Figure out what names from the original dataset need to be added to `sel_from_array`

In [11]:
# (name, e-mail address) pair for every person in the original dataset
names_to_emails = []
for person, record in data_dict.items():
    names_to_emails.append((person, record['email_address']))

In [13]:
# How many e-mail addresses from the original dataset also appear among the
# vectorized e-mails?
in_vector = []
for person_name, address in names_to_emails:
    if address in emails:
        in_vector.append(address)
print(len(in_vector))

86


In [14]:
# Vectorized e-mail addresses that have no match in the original dataset
out = []
for vec_email in emails:
    if vec_email not in in_vector:
        out.append(vec_email)
print(out)

['m..forney@enron.com', 'tim.despain@enron.com', 'larry.lawyer@enron.com', 'jeff.richter@enron.com']


In [15]:
# POI label (first tuple field) of every record whose e-mail address
# (fifth tuple field) is missing from the original dataset
[row_label
 for row_label, _row_to, _row_from, _row_all, row_email in label_email_text
 if row_email in out]

[True, True, True, True]

All four e-mail addresses that are missing from the original dataset belong to POIs, so I will include them in the new dictionary.

In [20]:
# People that only exist in the vectorized e-mail data, as (name, e-mail) pairs.
vector_only_names_emails = (('FORNEY M', 'm..forney@enron.com'),
                            ('DESPAIN TIM', 'tim.despain@enron.com'),
                            ('LAWYER LARRY', 'larry.lawyer@enron.com'),
                            ('RICHTER JEFF', 'jeff.richter@enron.com'))
print(len(names_to_emails))

# Append only the pairs that are not present yet, so re-running this cell
# cannot add the same people twice.
missing_pairs = [pair for pair in vector_only_names_emails
                 if pair not in names_to_emails]
names_to_emails.extend(missing_pairs)
print(len(names_to_emails))

144
148


Create lists with all the names, labels, and emails in the vector array as well as the missing ones from the original `data_dict`

In [17]:
# (name, e-mail) pairs whose address is absent from the vectorized e-mails
data_only_names_emails = []
for pair in names_to_emails:
    if pair[1] not in emails:
        data_only_names_emails.append(pair)

print('{} compare to:  {}'.format(len(data_only_names_emails),
                                  len(data_dict) - len(in_vector)))

 58 compare to:  58


In [21]:
# Split the dataset-only pairs into parallel name / e-mail tuples.
data_names, data_emails = zip(*data_only_names_emails)

# Vectorized e-mails first, dataset-only e-mails after them.
all_emails = list(emails)
all_emails.extend(data_emails)
print('{} compare to:  {}'.format(len(all_emails), len(names_to_emails)))

148 compare to:  148


In [46]:
# For every vectorized e-mail (in order), collect the name(s) registered
# under that address.
vector_names = [person_name
                for vec_email in emails
                for person_name, address in names_to_emails
                if address == vec_email]
all_names = vector_names + list(data_names)

print('length of vector_names: {} \nlast vector name and email: {} {} '
      '\nlength all names: {} compare to: {}'.format(
          len(vector_names), vector_names[89], emails[89],
          len(all_names), len(all_emails)))
    

length of vector_names: 90 
last vector name and email: MCCARTY DANNY J danny.mccarty@enron.com 
length all names: 148 compare to: 148


In [34]:
# POI flag of every person that only exists in the original dataset
data_only_labels = []
for person_name in data_names:
    data_only_labels.append(data_dict[person_name]['poi'])

# Same ordering as all_emails: vectorized records first, dataset-only after.
all_labels = list(labels)
all_labels += data_only_labels
print('{} compare to: {}'.format(len(all_labels), len(all_emails)))

148 compare to: 148


Add the missing `data_dict` samples and impute values to them.

In [27]:
# One all-NaN row per dataset-only person, with one column per selected word.
to_append = np.full((len(data_emails), sel_from_array.shape[1]), np.nan)
print('original array shape: {} \narray to append shape: {}'.format(
    sel_from_array.shape, to_append.shape))

original array shape: (90, 4212) 
array to append shape: (58, 4212)


In [28]:
# Stack the NaN rows under the densified selected TF-IDF rows.
new_array = np.vstack([sel_from_array.toarray(), to_append])
print('Shape of merged array: {}'.format(new_array.shape))

Shape of merged array: (148, 4212)


In [32]:
# NOTE(review): Imputer was deprecated in scikit-learn 0.20 and removed in
# 0.22 in favour of sklearn.impute.SimpleImputer; confirm the pinned version.
from sklearn.preprocessing import Imputer

# Replace every NaN with the most frequent value found for that feature.
imp = Imputer(strategy='most_frequent')
imp_new_array = imp.fit_transform(new_array)
print('Shape of array with imputed values: {} compare with: {}'.format(
    imp_new_array.shape, new_array.shape))

Shape of array with imputed values: (148, 4212) compare with: (148, 4212)


In [36]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

rdm = 200
split_options = dict(test_size=0.4, random_state=rdm)

# Split 1: only the people that have vectorized e-mails.
train_features_sel, test_features_sel, train_labels_sel, test_labels_sel = \
    train_test_split(sel_from_array.toarray(), labels, **split_options)

# Split 2: everybody, including the rows filled in by the imputer.
train_features, test_features, train_labels, test_labels = \
    train_test_split(imp_new_array, all_labels, **split_options)

# Fit and report the same classifier on both splits.
experiments = (
    ('Original: ',
     train_features_sel, train_labels_sel, test_features_sel, test_labels_sel),
    ('Imputed with additions: ',
     train_features, train_labels, test_features, test_labels),
)
for title, fit_x, fit_y, eval_x, eval_y in experiments:
    print(title)
    clf = GaussianNB()
    clf.fit(fit_x, fit_y)
    pred = clf.predict(eval_x)
    print(classification_report(eval_y, pred))

Original: 
             precision    recall  f1-score   support

      False       0.92      1.00      0.96        33
       True       0.00      0.00      0.00         3

avg / total       0.84      0.92      0.88        36

Imputed with additions: 
             precision    recall  f1-score   support

      False       0.90      1.00      0.95        53
       True       1.00      0.14      0.25         7

avg / total       0.91      0.90      0.87        60



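Imputing does not seem to hurt the scores, although the two runs are evaluated on different test sets with very few POIs (3 and 7), so the comparison is only indicative.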


Create the dictionary from the merged array

In [41]:
# Build {name: {'poi': label, word: value, ...}} from the merged array.
#
# Rows are paired with names by position (all_names, all_labels and the rows
# of imp_new_array share the same ordering) instead of by e-mail address:
# matching on the address gives every person that shares an address the row
# and label of the last matching index.
assert len(all_names) == len(all_labels) == imp_new_array.shape[0], \
    'names, labels and feature rows are out of sync'
assert len(sel_vocab_list) == imp_new_array.shape[1], \
    'sel_vocab_list must hold exactly one word per feature column'

vector_dict = {}
for name, label, row in zip(all_names, all_labels, imp_new_array):
    entry = {'poi': label}
    # tolist() converts the numpy scalars to plain Python floats
    entry.update(zip(sel_vocab_list, row.tolist()))
    vector_dict[str(name)] = entry

In [42]:
# Every inner dict must hold one key per selected word plus the 'poi' label.
# The check uses all() so that every entry counts; resetting a flag inside the
# loop would only report on the last entry visited.
expected_key_count = sel_from_array.shape[1] + 1
good = all(len(item) == expected_key_count for item in vector_dict.values())
print('All internal keys add up? {}'.format(good))

All internal keys add up? True


Check that when imported back the dictionary data is the same as the source data

In [44]:
from tools.feature_format import featureFormat

# featureFormat is given 'poi' as the first feature name. Guard the insert so
# that re-running this cell does not prepend 'poi' a second time.
sel_vocab_list = list(sel_vocab_list)
if sel_vocab_list[:1] != ['poi']:
    sel_vocab_list.insert(0, 'poi')

print('{} {}'.format(len(vector_dict), len(sel_vocab_list)))
data = featureFormat(vector_dict, sel_vocab_list,
                     remove_NaN=True, keep_keys=True, remove_all_zeroes=False)

# Column 0 is taken as the names, column 1 as the labels and the rest as the
# features.
# NOTE: this rebinds `keys` and `labels`, which earlier cells defined with a
# different meaning; re-run those cells before reusing them.
keys, labels, features = data[:,0], data[:,1].astype(float), data[:,2:].astype(float)
print('{} {} {}'.format(keys.shape, labels.shape, features.shape))

148 4213
(148,) (148,) (148, 4212)


In [67]:
# Sort the pre-dict data and the data read back from the dict by name so the
# two can be compared row by row.

# Pre-dict side
all_names = np.array(all_names)
all_to_sort = np.argsort(all_names)
sort_all_names = all_names[all_to_sort]
sort_all_labels = np.array(all_labels)[all_to_sort].astype(np.int8)
sort_imp_new_array = imp_new_array[all_to_sort, :].astype(np.float32)

# Post-dict side
new_to_sort = np.argsort(keys)
sort_keys = keys[new_to_sort]
sort_labels = labels[new_to_sort].astype(np.int8)
sort_features = features[new_to_sort].astype(np.float32)

comparisons = (
    ('names equal:', sort_all_names, sort_keys),
    ('labels equal:', sort_all_labels, sort_labels),
    ('features equal:', sort_imp_new_array, sort_features),
)
for title, before, after in comparisons:
    print('{} {}'.format(title, np.array_equal(before, after)))

names equal: True
labels equal: True
features equal: False


In [82]:
# (row indices, column indices) of the entries that differ between the arrays
np.nonzero(sort_imp_new_array != sort_features)

(array([19]), array([2142]))

In [87]:
# Locate the differing entries once and reuse the index for both arrays.
mismatch = np.where(sort_imp_new_array != sort_features)
# `original` holds the values read back from the dict (sort_features);
# `new` holds the values from the pre-dict array (sort_imp_new_array).
original = sort_features[mismatch]
new = sort_imp_new_array[mismatch]
print('{} {} are equal:? {}'.format(original, new, original == new))

[ 0.02454611] [ 0.02454611] are equal:? [False]


The difference between the two arrays seems to be a rounding error. 

Save `vector_dict` to `vector_dict.pkl` and the list of its inner keys (`sel_vocab_list`, i.e. `'poi'` followed by the selected words) to `vector_names.pkl`.

In [89]:
import pickle
import os

# Output locations for the two pickles.
HOME_PATH = os.path.expanduser('~')
DATA_PATH = os.path.join(HOME_PATH, 'Desktop', 'raw_data', 'ml')
vector_path = os.path.join(DATA_PATH, 'vector_dict.pkl')
name_path = os.path.join(DATA_PATH, 'vector_names.pkl')

# vector_dict goes to vector_dict.pkl; the list held in sel_vocab_list goes
# to vector_names.pkl.
for target_path, payload in ((vector_path, vector_dict),
                             (name_path, sel_vocab_list)):
    with open(target_path, 'wb') as f:
        pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)