In [1]:
import numpy as np
import time
import matplotlib.pyplot as plt
from matplotlib import rcParams

from sklearn.datasets import load_breast_cancer, load_digits, fetch_olivetti_faces
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.decomposition import PCA

from sklearn.svm import SVC

rcParams['figure.figsize'] = (10,5)

### Load Datasets

In [2]:
cancer_X, cancer_y = load_breast_cancer(return_X_y=True)
digits_X, digits_y = load_digits(return_X_y=True)

faces = fetch_olivetti_faces()
faces_X = faces['data']
faces_y = faces['target']

### Create Train/Test Splits

In [3]:
digits_Xtrain, digits_Xtest, digits_ytrain, digits_ytest = train_test_split(digits_X, digits_y, train_size=.7)
cancer_Xtrain, cancer_Xtest, cancer_ytrain, cancer_ytest = train_test_split(cancer_X, cancer_y, train_size=.7)
faces_Xtrain, faces_Xtest, faces_ytrain, faces_ytest = train_test_split(faces_X, faces_y, train_size=.7)

## Breast Cancer

Since there are different units across the features, it is most appropriate to use the 
Random Forest feature importance dimension reduction.

In [9]:
before = time.time()
rf = RandomForestClassifier()
cancer_acc = rf.fit(cancer_Xtrain, cancer_ytrain).score(cancer_Xtest, cancer_ytest)
after = time.time()

cancer_timing = after - before

top_features = np.argsort(rf.feature_importances_)[:rf.feature_importances_.size//4]
reduced_cancer_Xtrain = cancer_Xtrain[:,top_features]
reduced_cancer_Xtest = cancer_Xtest[:,top_features]

print "Top Features:"
for l in load_breast_cancer()['feature_names'][top_features]:
    print '\t', l
print ""
    
reduced_cancer_acc = RandomForestClassifier().fit(reduced_cancer_Xtrain, cancer_ytrain).score(reduced_cancer_Xtest, cancer_ytest)
after = time.time()
reduced_cancer_timing = after - before
                                                                                              
print "{:<10} {:<10} {:<10}".format("", "Accuracy", "Timings")
print "{:<10} {:<10.3%} {:<10.4}".format("Original", cancer_acc, cancer_timing)
print "{:<10} {:<10.3%} {:<10.4}".format("Reduced", reduced_cancer_acc, reduced_cancer_timing)                                                                                        

Top Features:
	fractal dimension error
	texture error
	perimeter error
	mean symmetry
	compactness error
	concavity error
	radius error

           Accuracy   Timings   
Original   94.152%    0.05897   
Reduced    84.211%    0.12      


For a dataset this small, we actually lose accuracy *and* execution time. The execution time is nearly double for the reduced method because we have to run the original model first to be able to determine the feature importances. For a especially large datasets, we would choose a small subset of the data to choose calculate the feature importances before running the random forest on all of the reduced data.

## Digits
Since all of the features have pixel values as a their units, it is feasible to use PCA.

In [12]:
before = time.time()
digits_acc = (SVC(kernel='poly').fit(digits_Xtrain, digits_ytrain).predict(digits_Xtest) == digits_ytest).mean()
after = time.time()

digits_timing = after-before

before = time.time()
pca = PCA(digits_Xtrain.shape[1]//4)
reduced_digits_Xtrain = pca.fit_transform(digits_Xtrain)
reduced_digits_Xtest = pca.transform(digits_Xtest)

reduced_digits_acc = SVC(kernel='poly').fit(reduced_digits_Xtrain, digits_ytrain).score(reduced_digits_Xtest, digits_ytest)
after = time.time()

reduced_digits_timing = after-before

print "{:<10} {:<10} {:<10}".format("", "Accuracy", "Timings")
print "{:<10} {:<10.3%} {:<10.4}".format("Original", digits_acc, digits_timing)
print "{:<10} {:<10.3%} {:<10.4}".format("Reduced", reduced_digits_acc, reduced_digits_timing)

           Accuracy   Timings   
Original   99.259%    0.05406   
Reduced    98.704%    0.04744   


## Faces
As with the digits, since all of the features have pixel values as a their units, it is feasible to use PCA.

In [22]:
before = time.time()
faces_acc = SVC(kernel='poly', C=10, gamma=.01, coef0=1).fit(faces_Xtrain, faces_ytrain).score(faces_Xtest, faces_ytest)
after = time.time()
faces_timing = after - before

before = time.time()
pca = PCA(faces_Xtrain.shape[1]//4)
reduced_faces_Xtrain = pca.fit_transform(faces_Xtrain)
reduced_faces_Xtest = pca.transform(faces_Xtest)

reduced_faces_acc = SVC(kernel='poly', C=10, gamma=.01, coef0=1).fit(reduced_faces_Xtrain, faces_ytrain).score(reduced_faces_Xtest, faces_ytest)
after = time.time()
reduced_faces_timing = after - before

print "{:<10} {:<10} {:<10}".format("", "Accuracy", "Timings")
print "{:<10} {:<10.3%} {:<10.4}".format("Original", faces_acc, faces_timing)
print "{:<10} {:<10.3%} {:<10.4}".format("Reduced", reduced_faces_acc, reduced_faces_timing)                            

           Accuracy   Timings   
Original   95.833%    0.652     
Reduced    95.833%    0.1603    


Notice there was a time improvement in both cases without sacrificing much (if any) accuracy.