# Tf-Idf feature extraction

With the data organized into a dictionary extract text features based on term-frequency times inverse document-frequency (Tf-Idf) vectorization

Load dictionary created in [email_features](email_features.ipynb) with email archive organized by email as `full_text_dict`

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle

HOME_PATH = os.path.expanduser('~')
DATA_PATH = os.path.join(HOME_PATH, 'Desktop', 'raw_data', 'ml')

full_text_path = os.path.join(DATA_PATH, 'full_text_dict.pkl')
with open(full_text_path, 'rb') as f:
    full_text_dict = pickle.load(f)

In [2]:
def email_list_and_labels(word_dict):
    """
    Consolidates all the emails into a single string that can be used for 
    TfIdf vectorization
    
    Returns a list of dictos:
    {email: email
    poi: True/False
    to_text: all words in emails to this email
    from_text: all words in emails from this emails}
    """
    dicto_list = []
    
    for key in word_dict:
        
        dicto = {'email': key, 'poi': word_dict[key]['poi']}
        
        to_compilation = []
        from_compilation = []
        
        if 'to' in word_dict[key]:
            for temp_d in word_dict[key]['to']:
                to_compilation.append(temp_d['stemmed'])
                
        if 'from' in word_dict[key]:
            for temp_d in word_dict[key]['from']:
                from_compilation.append(temp_d['stemmed'])
        
        to_compilation = ''.join(to_compilation)
        from_compilation = ''.join(from_compilation)
        
        dicto['to_text'] = to_compilation
        dicto['from_text'] = from_compilation
        
        dicto_list.append(dicto)
        
        print '.',

    return dicto_list

In [3]:
text_dicts = email_list_and_labels(full_text_dict)

. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .


Separate emails into to, from, or all categories

In [4]:
labels_emails = [(dicto['poi'], 
                  dicto['to_text'], 
                  dicto['from_text'], 
                  dicto['to_text'] + ' ' + dicto['from_text'],
                  dicto['email'])
                 for dicto in text_dicts]

In [5]:
labels, to_text, from_text, all_text, emails = zip(*labels_emails)

In [6]:
print(len(to_text), len(from_text), len(all_text), len(emails))

(90, 90, 90, 90)


Perform the actual vectorization

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')

from_array = vectorizer.fit_transform(from_text)
to_array = vectorizer.fit_transform(to_text)
all_array = vectorizer.fit_transform(all_text)
print('From array shape: {}, To array shape: {}, All array shape: {}' 
      .format(from_array.shape, to_array.shape, all_array.shape))

From array shape: (90, 42115), To array shape: (90, 62495), All array shape: (90, 74006)


In [9]:
from sklearn.model_selection import train_test_split

train_from_features, test_from_features, train_from_labels, test_from_labels =\
train_test_split(from_array, labels, test_size=0.4, random_state=30)

train_to_features, test_to_features, train_to_labels, test_to_labels =\
train_test_split(to_array, labels, test_size=0.4, random_state=30)

train_all_features, test_all_features, train_all_labels, test_all_labels =\
train_test_split(all_array, labels, test_size=0.4, random_state=30)

Test the word vectors using Naive Bayes. The testing compares vectors using words from all messages to and from an email with those just from an email. It also compares different selection methods.

In [29]:
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, mutual_info_classif
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

def tester(percent, train_features, test_features, train_labels, test_labels):
    scr_list = [f_classif, chi2] #, mutual_info_classif]
    clf = GaussianNB()
    for scr in scr_list:
        print(scr)
        selector = SelectPercentile(scr, percentile=percent)
        try:
            train_features_sel = selector.fit_transform(train_features, train_labels).toarray()
            test_features_sel = selector.transform(test_features).toarray()
        except AttributeError:
            train_features_sel = selector.fit_transform(train_features, train_labels)
            test_features_sel = selector.transform(test_features)
        clf.fit(train_features_sel, train_labels)
        pred = clf.predict(test_features_sel)
        print(classification_report(test_labels, pred))

In [41]:
print '10% FROM:'
tester(10, train_from_features, test_from_features, train_from_labels, test_from_labels)
print '#########'
print '10% TO:'
tester(10, train_to_features, test_to_features, train_to_labels, test_to_labels)
print '#########'
print '10% ALL:'
tester(10, train_all_features, test_all_features, train_all_labels, test_all_labels)
print '#########'
print '5% FROM:'
tester(5, train_from_features, test_from_features, train_from_labels, test_from_labels)
print '#########'
print '5% TO:'
tester(5, train_to_features, test_to_features, train_to_labels, test_to_labels)
print '#########'
print '5% ALL:'
tester(5, train_all_features, test_all_features, train_all_labels, test_all_labels)
print '#########'

10% FROM:
<function f_classif at 0x1122bc9b0>
             precision    recall  f1-score   support

      False       0.85      0.79      0.81        28
       True       0.40      0.50      0.44         8

avg / total       0.75      0.72      0.73        36

<function chi2 at 0x1122bc6e0>
             precision    recall  f1-score   support

      False       0.81      0.89      0.85        28
       True       0.40      0.25      0.31         8

avg / total       0.72      0.75      0.73        36

<function mutual_info_classif at 0x11251a500>
             precision    recall  f1-score   support

      False       0.81      0.79      0.80        28
       True       0.33      0.38      0.35         8

avg / total       0.71      0.69      0.70        36

#########
10% TO:
<function f_classif at 0x1122bc9b0>
             precision    recall  f1-score   support

      False       1.00      0.21      0.35        28
       True       0.27      1.00      0.42         8

avg / total      

From words combined by selection using f_classif yields the f1 score for the True classification: 0.44. Will use this data for the full analysis.

In [132]:
label_email_text_path = os.path.join(DATA_PATH, 'label_email_text.pkl')

with open(label_email_text_path, 'wb') as f:
    pickle.dump(labels_emails, f, protocol=pickle.HIGHEST_PROTOCOL)