In [1]:
""" 
    This is the code to accompany the Lesson 2 (SVM) mini-project.

    Use an SVM to identify emails from the Enron corpus by their authors:
    Sara has label 0
    Chris has label 1
"""
    
import numpy as np
from time import time
from email_preprocess import preprocess
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
# features_train, features_test, labels_train, labels_test = preprocess()



## training the model on all the data (this takes a long time)

In [2]:
# Load a fresh train/test split for this experiment.
features_train, features_test, labels_train, labels_test = preprocess()

# Linear-kernel SVM fitted on every training sample.
clf = SVC(kernel='linear')

# Wall-clock time spent fitting the classifier.
t0 = time()
clf.fit(features_train, labels_train)
print('training time: {:.3f}s'.format(time() - t0))

# Wall-clock time spent predicting the whole test set.
t1 = time()
pred = clf.predict(features_test)
print('predicting time: {:.3f}s'.format(time() - t1))

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# Accuracy is symmetric, so the printed value is the same, but the call now
# matches the API contract and stays correct if another metric is swapped in.
print(accuracy_score(labels_test, pred))

training time: 310.614s
predicting time: 26.822s
0.984072810011


## training the model on a smaller dataset (1% of the training data)

In [3]:
# Load a fresh train/test split; the training part is truncated below.
features_train, features_test, labels_train, labels_test = preprocess()

# Compute the subset size once, before either sequence is truncated, so the
# features and the labels are always cut to the same length regardless of the
# order of the two slicing statements.
n_subset = len(labels_train) // 100
features_train = features_train[:n_subset]
labels_train = labels_train[:n_subset]
print('1% of the original data: {} data traing samples'.format(len(labels_train)))

# Linear-kernel SVM fitted on the reduced training set only.
clf = SVC(kernel='linear')

# Wall-clock time spent fitting the classifier.
t0 = time()
clf.fit(features_train, labels_train)
print('training time: {:.3f}s'.format(time() - t0))

# Wall-clock time spent predicting the (untruncated) test set.
t1 = time()
pred = clf.predict(features_test)
print('predicting time: {:.3f}s'.format(time() - t1))

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print(accuracy_score(labels_test, pred))

1% of the original data: 158 data traing samples
training time: 0.136s
predicting time: 1.498s
0.884527872582


## rbf kernel, small dataset

In [4]:
# Load a fresh train/test split; the training part is truncated below.
features_train, features_test, labels_train, labels_test = preprocess()

# Compute the subset size once, before either sequence is truncated, so the
# features and the labels are always cut to the same length regardless of the
# order of the two slicing statements.
n_subset = len(labels_train) // 100
features_train = features_train[:n_subset]
labels_train = labels_train[:n_subset]
print('1% of the original data: {} data traing samples\n'.format(len(labels_train)))

# Fit one RBF-kernel SVM per value of the penalty parameter C and report, for
# each, the fit time plus prediction time and accuracy on both the (reduced)
# training set and the full test set.
for c in [10, 100, 1000, 10000]:
    clf = SVC(kernel='rbf', C=c)
    print('parameter C: {}'.format(c))

    # Fitting.
    t0 = time()
    clf.fit(features_train, labels_train)
    t1 = time()
    print('training time: {:.3f}s'.format(t1 - t0))

    # Predictions on the samples the model was fitted on.
    t0 = time()
    pred_train = clf.predict(features_train)
    t1 = time()
    print('predicting time on training samples: {:.3f}s'.format(t1 - t0))
    # accuracy_score is documented as (y_true, y_pred): ground truth goes first.
    print('accuracy of training data', accuracy_score(labels_train, pred_train))

    # Predictions on the held-out test samples.
    t0 = time()
    pred_test = clf.predict(features_test)
    t1 = time()
    print('predicting time on test samples: {:.3f}s'.format(t1 - t0))
    print('accuracy of test data', accuracy_score(labels_test, pred_test), '\n')

1% of the original data: 158 data traing samples

parameter C: 10
training time: 0.151s
predicting time on training samples: 0.152s
accuracy of training data 0.677215189873
predicting time on test samples: 1.666s
accuracy of test data 0.616040955631 

parameter C: 100
training time: 0.151s
predicting time on training samples: 0.153s
accuracy of training data 0.677215189873
predicting time on test samples: 1.658s
accuracy of test data 0.616040955631 

parameter C: 1000
training time: 0.145s
predicting time on training samples: 0.148s
accuracy of training data 0.962025316456
predicting time on test samples: 1.555s
accuracy of test data 0.821387940842 

parameter C: 10000
training time: 0.141s
predicting time on training samples: 0.124s
accuracy of training data 1.0
predicting time on test samples: 1.328s
accuracy of test data 0.892491467577 



## rbf kernel, C=10000, all training samples

In [5]:
# Load a fresh, untruncated train/test split for this experiment.
features_train, features_test, labels_train, labels_test = preprocess()
print('{} training samples'.format(len(features_train)))

# RBF-kernel SVM with C=10000, fitted on every training sample.
clf = SVC(kernel='rbf', C=10000)

# Wall-clock time spent fitting the classifier.
t0 = time()
clf.fit(features_train, labels_train)
t1 = time()
print('training time: {:.3f}s'.format(t1 - t0))

# Wall-clock time spent predicting the whole test set. pred_test is kept as a
# notebook-level variable because the following cells inspect it.
t0 = time()
pred_test = clf.predict(features_test)
t1 = time()
print('predicting time on test samples: {:.3f}s'.format(t1 - t0))

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print('accuracy of test data', accuracy_score(labels_test, pred_test))

15820 training samples
training time: 168.316s
predicting time on test samples: 16.889s
accuracy of test data 0.990898748578


## the predicted classes for the test samples at indices 10, 26 and 50

In [6]:
# Positions in the test set whose predicted label we want to look at.
sample_indices = [10, 26, 50]
# Indexing with a list of integers picks exactly those elements, in order.
print(pred_test[sample_indices])

[1 0 1]


## how many test emails are predicted to be in the Chris (label 1) class

In [7]:
# Sara is labelled 0 and Chris 1, so the sum of the predicted labels is the
# number of test emails assigned to Chris.
chris_count = np.sum(pred_test)
print(chris_count)

877
