In [39]:
#!/usr/bin/python

import pickle
import numpy
# Seed numpy's global random number generator so repeated runs start from the
# same state.
numpy.random.seed(42)


### The words (features) and authors (labels), already largely processed.
### These files should have been created by the previous (Lesson 10)
### mini-project.
words_file = "../text_learning/your_word_data.pkl"
authors_file = "../text_learning/your_email_authors.pkl"

# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that you generated yourself.
# The `with` blocks close each file handle as soon as it has been read instead
# of leaving that to the garbage collector.
# NOTE(review): the text mode "r" is kept from the original and only works on
# Python 2; Python 3's pickle requires "rb" -- confirm how the files were
# written before changing it.
with open(words_file, "r") as words_in:
    word_data = pickle.load(words_in)
with open(authors_file, "r") as authors_in:
    authors = pickle.load(authors_in)


In [40]:

### Hold out part of the events for testing: test_size=0.1 sends 10% of them
### to the test set and the remainder go into training. The fixed random_state
### makes the split repeatable.
### The feature matrices are converted to dense representations further down;
### the original note gives compatibility with classifier functions in
### versions 0.15.2 and earlier as the reason.
from sklearn import model_selection
(features_train, features_test,
 labels_train, labels_test) = model_selection.train_test_split(
    word_data,
    authors,
    test_size=0.1,
    random_state=42,
)


In [41]:

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting with sublinear term-frequency scaling, the built-in English
# stop-word list, and a document-frequency cap of 0.5.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
    stop_words='english',
)

# Fit the vocabulary on the training text only, then reuse it for the test
# text; the test features are converted to a dense array right away.
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

In [42]:
# Inspect the vectorized training features; fit_transform returned them as a
# sparse matrix, so only a summary repr is shown.
features_train

<15820x37861 sparse matrix of type '<type 'numpy.float64'>'
	with 936927 stored elements in Compressed Sparse Row format>

In [43]:
# Dense view of the whole training matrix, for a quick look only.
# NOTE(review): .toarray() allocates every cell of the matrix; at the
# 15820x37861 float64 shape reported by the previous cell that is several GB.
# Consider slicing a few rows before converting.
features_train.toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [44]:
### Overfitting is easy to provoke with few data points and many features.
### Keep only the first 150 training events (features converted to a dense
### array, labels sliced to match) to put ourselves in that regime.
features_train, labels_train = (
    features_train[:150].toarray(),
    labels_train[:150],
)

In [45]:
# Check the training features after truncation: now a dense array holding the
# first 150 rows.
features_train

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [46]:
# Show the training labels that were kept alongside the first 150 feature rows.
labels_train

[0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0]

In [47]:
### Fit a decision tree with default settings on the truncated training set.
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
# Kept as the cell's last expression so the fitted estimator's repr is shown.
# NOTE(review): no random_state is passed; presumably the tree then relies on
# numpy's global RNG seeded at the top of the notebook -- confirm.
clf.fit(features_train, labels_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [48]:
# Score the tree on the same events it was trained on.
clf.score(features_train, labels_train)

1.0

In [49]:
# Score the tree on the held-out test split; a large gap versus the training
# score points to overfitting.
clf.score(features_test, labels_test)

0.81683731513083047

In [26]:
feature_names = vectorizer.get_feature_names()

idx = -1
for fi in clf.feature_importances_:
    idx += 1
    if fi >= 0.01:
        print 'feature', fi, idx, feature_names[idx]
        

feature 0.0749500333111 19671 fyi
feature 0.0263157894737 24321 leav
feature 0.134028294862 33201 smith
feature 0.764705882353 33614 sshacklensf


In [38]:
feature_names = vectorizer.get_feature_names()

idx = -1
for fi in clf.feature_importances_:
    idx += 1
    if fi >= 0.01:
        print 'feature', fi, idx, feature_names[idx]
        

feature 0.162601626016 8674 62502pst
feature 0.0506072874494 14337 cgerman
feature 0.666666666667 14343 cgermannsf
feature 0.093808630394 16268 deal
feature 0.0263157894737 35399 trade


In [50]:
feature_names = vectorizer.get_feature_names()

idx = -1
for fi in clf.feature_importances_:
    idx += 1
    if fi >= 0.01:
        print 'feature', fi, idx, feature_names[idx]
        

feature 0.105378579003 11975 attach
feature 0.0262801932367 13080 bond
feature 0.0137142857143 15434 copi
feature 0.0474074074074 16267 deal
feature 0.0426666666667 18095 enron
feature 0.186927243449 18849 fax
feature 0.02 19196 floor
feature 0.363636363636 21323 houectect
feature 0.012 21327 hour
feature 0.0840692099229 22546 isda
feature 0.0248101945003 24320 leav
feature 0.0255293305728 25675 master
feature 0.0475805258904 29690 pleas
