In [1]:
import pickle
import cPickle
import numpy

from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif




In [2]:
authors_file = "../tools/email_authors.pkl"
### the author labels for the emails, already largely preprocessed
# Open in binary mode: a pickle stream is bytes, and text mode can corrupt it
# on platforms that translate line endings (and fails outright on Python 3).
# The `with` block guarantees the handle is closed even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(authors_file, "rb") as authors_file_handler:
    authors = pickle.load(authors_file_handler)

## get the frequency per author
from collections import Counter
def group_list(lst):
    """Count how often each distinct value occurs in ``lst``.

    Args:
        lst: an iterable of hashable items (here, the author labels).

    Returns:
        A list of ``(value, count)`` tuples, one per distinct value.
        An empty input yields an empty list.
    """
    # Build the Counter once and read the pairs straight from it, rather than
    # zipping the keys and values of two separately constructed Counters.
    return list(Counter(lst).items())
# Show how many entries carry each author label (a quick class-balance check).
print(group_list(authors)) 

[(0, 8777), (1, 8801)]


In [3]:
words_file = "../tools/word_data.pkl"
# Open in binary mode: a pickle stream is bytes, and text mode can corrupt it
# on platforms that translate line endings. The `with` block guarantees the
# handle is closed even if unpickling raises.
# NOTE: cPickle.load can execute arbitrary code -- only load trusted files.
with open(words_file, "rb") as words_file_handler:
    word_data = cPickle.load(words_file_handler)

# Preview only the first few documents; echoing the whole list would flood the
# notebook output and bloat the saved file.
word_data[:3]

[' sbaile2 nonprivilegedpst susan pleas send the forego list to richard thank   enron wholesal servic 1400 smith street eb3801a houston tx 77002 ph 713 8535620 fax 713 6463490',
 ' sbaile2 nonprivilegedpst 1 txu energi trade compani 2 bp capit energi fund lp may be subject to mutual termin 2 nobl gas market inc 3 puget sound energi inc 4 virginia power energi market inc 5 t boon picken may be subject to mutual termin 5 neumin product co 6 sodra skogsagarna ek for probabl an ectric counterparti 6 texaco natur gas inc may be book incorrect for texaco inc financi trade 7 ace capit re oversea ltd 8 nevada power compani 9 prior energi corpor 10 select energi inc origin messag from tweed sheila sent thursday januari 31 2002 310 pm to   subject pleas send me the name of the 10 counterparti that we are evalu thank',
 ' sbaile2 nonprivilegedpst all here the second tier of counterparti to add to the data retriev list 11 medianew group inc 12 macromedia incorpor 13 british airway plc 14 merc irri

In [4]:
### test_size is the fraction of samples held out for the test set (0.1 = 10%);
### everything else goes into the training set. The fixed random_state makes
### the shuffle, and therefore the split, reproducible between runs.
(features_train,
 features_test,
 labels_train,
 labels_test) = cross_validation.train_test_split(
    word_data,
    authors,
    test_size=0.1,
    random_state=42,
)

In [5]:
### Text vectorization: turn each raw document string into a numeric tf-idf vector.
# - Term frequency (TF): how often a term occurs within a given document.
# - Inverse document frequency (IDF): down-weights terms that occur in many
#   documents, so rarer, more distinctive terms carry more weight.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,     # scale term counts as 1 + log(tf)
    max_df=0.5,            # ignore terms present in more than half the documents
    stop_words='english',  # drop scikit-learn's built-in English stop words
)

# Learn the vocabulary and idf weights from the training documents only and
# build their document-term matrix in one step (same result as calling fit()
# and then transform(), but implemented more efficiently).
features_train_transformed = vectorizer.fit_transform(features_train)

# Map the test documents onto the vocabulary and idf weights learned above;
# the vectorizer is not refit on the test data.
features_test_transformed = vectorizer.transform(features_test)

In [6]:
### Feature selection: tf-idf text features are extremely high dimensional,
### which makes downstream training expensive, so keep only the strongest ones.

# SelectPercentile scores every feature with f_classif (ANOVA F-value against
# the labels) and keeps the top `percentile` percent of them.
selector = SelectPercentile(f_classif, percentile=10)

# NOTE(review): this cell rebinds the very matrices it reads, so re-running it
# on its own will not reproduce the same result -- re-run the vectorization
# cell first.
# Score the features on the training data only, keep the selected columns and
# densify the sparse result.
features_train_transformed = selector.fit_transform(
    features_train_transformed, labels_train
).toarray()

# Apply the same column selection (learned from the training data) to the
# test matrix and densify it as well.
features_test_transformed = selector.transform(
    features_test_transformed
).toarray()

595