In [1]:
import numpy as np
import time

from sklearn import decomposition, datasets, linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the bundled breast-cancer dataset, then pull out the feature matrix and the target vector.
breast_cancer = datasets.load_breast_cancer()
x, y = breast_cancer.data, breast_cancer.target

In [3]:
# Sanity check of the feature matrix dimensions: (number of samples, number of features).
x.shape

(569, 30)

In [4]:
## Feature Scaling on x
sc = StandardScaler()
x_std = sc.fit_transform(x)

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x_std, y, random_state = 0)

# Finding Optimal K

In [6]:
# Fit a PCA that keeps every component, only to inspect how much variance each one explains.
pca = decomposition.PCA()
# fit() is sufficient: the projected data was discarded, only the fitted attributes are read.
pca.fit(x_train)
pca.explained_variance_

array([  1.30274684e+01,   5.81556555e+00,   2.85848795e+00,
         1.91901713e+00,   1.70021491e+00,   1.20663908e+00,
         6.53337152e-01,   4.26738474e-01,   4.26450541e-01,
         3.45589858e-01,   3.08054910e-01,   2.56054468e-01,
         2.28152003e-01,   1.43262745e-01,   9.26283031e-02,
         7.80260477e-02,   6.13812037e-02,   5.26182531e-02,
         4.50933578e-02,   3.08275366e-02,   3.03277956e-02,
         2.51390631e-02,   2.12226717e-02,   1.77427715e-02,
         1.63886382e-02,   7.83681541e-03,   6.61084728e-03,
         1.45257891e-03,   7.98794510e-04,   1.11908784e-04])

In [7]:
# Pick the smallest k whose leading components explain at least
# VARIANCE_THRESHOLD (95% here) of the total variance.
VARIANCE_THRESHOLD = 0.95

total = np.sum(pca.explained_variance_)
# Share of the total variance explained by the first 1, 2, ... components.
cumulative_share = np.cumsum(pca.explained_variance_) / total

# searchsorted returns the index of the first share >= threshold; +1 turns the index into a count.
# min() keeps k valid even if rounding leaves the final share a hair below the threshold,
# a case in which the previous manual loop would have indexed past the end of the array.
k = min(int(np.searchsorted(cumulative_share, VARIANCE_THRESHOLD)) + 1, len(cumulative_share))
k

10

In [8]:
# Rebuild the PCA so it keeps only the k components selected in the previous cell.
pca = decomposition.PCA(n_components=k)

# Fit on the training split only, then project the test split with the same fitted components.
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)


# Testing score and time on original data

In [9]:

# Baseline: logistic regression trained on the full (un-reduced) feature set.
clf = linear_model.LogisticRegression()

# perf_counter() is the monotonic, high-resolution clock intended for timing short intervals;
# time.time() follows the wall clock, which can be adjusted and may tick coarsely.
start = time.perf_counter()
clf.fit(x_train, y_train)
time_taken = time.perf_counter() - start

# Fit time in seconds, then accuracy on the held-out split.
print(time_taken)
print(clf.score(x_test, y_test))

0.010612249374389648
0.965034965035


# Testing score and time on PCA applied data

In [10]:


# Same model as the baseline, but trained on the PCA-projected features.
clf = linear_model.LogisticRegression()

# perf_counter() is the monotonic, high-resolution clock intended for timing short intervals;
# time.time() follows the wall clock, which can be adjusted and may tick coarsely.
start = time.perf_counter()
clf.fit(x_train_pca, y_train)
time_taken = time.perf_counter() - start

# Fit time in seconds, then accuracy on the PCA-projected held-out split.
print(time_taken)
print(clf.score(x_test_pca, y_test))

0.0025768280029296875
0.958041958042


PCA pays off, i.e. training takes less time while accuracy stays reasonable, on substantially larger datasets.
On smaller datasets the difference is very small.

In [11]:
## These are the eigenvalues of the covariance matrix of the training data, one per kept component;
## a larger eigenvalue means that component captures more of the data's variance.
# The eigenvectors (pca.components_) give the directions of the new axes, and each eigenvalue is
# the variance of the data along its eigenvector.
pca.explained_variance_

array([ 13.02746837,   5.81556555,   2.85848795,   1.91901713,
         1.70021491,   1.20663908,   0.65333715,   0.42673847,
         0.42645054,   0.34558986])