# Predicting Breast Cancer Dataset using PCA

* Use PCA to narrow down breast-cancer dataset to two principal components
    - For Visualisation
    - For comparing performance & accuracy against direct methods
    
* Use SVM based classifier for creating Malignant/Benign Classifier
    - Grid Search for optimal parameter selection
    - Impact of PCA on classifier accuracy

In [1]:
import numpy as np 
import pandas as pd 

In [2]:
import matplotlib.pyplot as plt 

%matplotlib notebook 

In [3]:
# The bare 'seaborn' style sheet was deprecated in matplotlib 3.6 and dropped in
# later releases, where it is shipped as 'seaborn-v0_8'. Use whichever name this
# matplotlib installation actually provides so the cell runs on old and new versions.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

In [4]:
from sklearn.datasets import load_breast_cancer

In [5]:
# Load the scikit-learn breast-cancer dataset bundle; later cells read its
# 'data', 'target' and 'feature_names' entries.
data = load_breast_cancer()

In [6]:
# List the fields available in the loaded dataset bundle.
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [7]:
# Shape of the label vector once it is laid out as a single column.
data['target'].reshape(-1, 1).shape

(569, 1)

In [8]:
# Shape of the combined array: the features with the labels appended as the last column.
np.column_stack((data['data'], data['target'])).shape

(569, 31)

In [9]:
# Column labels for the combined table: the feature names followed by 'target'.
np.concatenate((data['feature_names'], ['target']))

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension', 'target'],
      dtype='<U23')

In [10]:
# Assemble one table holding the features plus a trailing 'target' column.
df = pd.DataFrame(
    np.column_stack((data['data'], data['target'])),
    columns=[*data['feature_names'], 'target'],
)
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0.0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0.0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0.0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0.0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0.0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0.0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0.0


In [11]:
# Features are every column except the trailing 'target' column; the labels are that column.
X = df.drop(columns='target')
y = df['target']

In [12]:
# Inspect the feature matrix (every column of df except the last).
X

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [13]:
# Inspect the label column (the last column of df, named 'target').
y

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
564    0.0
565    0.0
566    0.0
567    0.0
568    1.0
Name: target, Length: 569, dtype: float64

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Standardise every feature with StandardScaler, fitting and applying it in two explicit steps.
# NOTE(review): the scaler is fitted on all rows; if a train/test split follows later,
# confirm that fitting before the split is intended.
X_norm = StandardScaler().fit(X).transform(X)
X_norm

array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ...,
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])

In [16]:
from sklearn.decomposition import PCA

In [17]:
# Keep six components: the first two feed the scatter plot, all six feed the scree plot.
# Depending on the scikit-learn version, svd_solver='auto' can select the randomized
# solver for an input of this shape, so pin random_state to keep the fitted components
# reproducible across kernel restarts.
pca = PCA(n_components=6, svd_solver='auto', random_state=0)
pca

PCA(n_components=6)

In [18]:
# Fit the PCA on the standardised features and project them onto the fitted components.
principal_components = pca.fit_transform(X_norm)

In [19]:
# Sanity check: the labels are 1-D, so they can be appended to the projection as a single column.
y.values.shape

(569,)

In [20]:
# Build a table of the projected samples with the label appended as the last column.
# Component column names are derived from the actual width of the projection so this
# cell keeps working if the PCA's n_components is changed.
pc_columns = [f'PC{i}' for i in range(1, principal_components.shape[1] + 1)]
pc_df = pd.DataFrame(np.c_[principal_components, y.values],
                     columns=pc_columns + ['target'])
pc_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,target
0,9.192837,1.948583,-1.123166,3.633732,-1.195107,1.411426,0.0
1,2.387802,-3.768172,-0.529293,1.118265,0.621780,0.028650,0.0
2,5.733896,-1.075174,-0.551748,0.912080,-0.177094,0.541456,0.0
3,7.122953,10.275589,-3.232790,0.152546,-2.960880,3.053421,0.0
4,3.935302,-1.948072,1.389767,2.940638,0.546743,-1.226490,0.0
...,...,...,...,...,...,...,...
564,6.439315,-3.576817,2.459486,1.177311,-0.074834,-2.375187,0.0
565,3.793382,-3.584048,2.088476,-2.506028,-0.510722,-0.246712,0.0
566,1.256179,-1.902297,0.562731,-2.089227,1.809993,-0.534448,0.0
567,10.374794,1.672010,-1.877029,-2.356031,-0.033744,0.567941,0.0


In [21]:
# Scatter the target == 0 samples in the PC1/PC2 plane; keep the returned axes so
# the other class can be drawn on the same plot.
ax = pc_df[pc_df['target'] == 0].plot.scatter(x='PC1', y='PC2', c='red', label='$y=0$')

<IPython.core.display.Javascript object>

In [22]:
# Overlay the target == 1 samples on the axes returned by the target == 0 scatter.
pc_df[pc_df['target'] == 1].plot.scatter(x='PC1', y='PC2', c='green', label='y=1', ax=ax)

<AxesSubplot:xlabel='PC1', ylabel='PC2'>

In [23]:
# Title the PC1/PC2 scatter through its axes handle, then re-display its figure.
ax.set_title('Data distribution for $PC_1$ and $PC_2$')
ax.figure

<IPython.core.display.Javascript object>

In [24]:
# Scratch check of the index range; note this starts at 0, whereas the scree plot's x values start at 1.
np.arange(pca.n_components+1)

array([0, 1, 2, 3, 4, 5, 6])

In [25]:
# Scree plot on a new figure: one bar per principal component (numbered from 1)
# showing its share of the explained variance.
x = np.arange(pca.n_components) + 1
plt.figure()
plt.bar(x, height=pca.explained_variance_ratio_)

<IPython.core.display.Javascript object>

<BarContainer object of 6 artists>

In [26]:
# Axis labels for the scree plot. The y label is a raw string because '\s' is not a
# valid Python escape and triggers an invalid-escape warning in a normal literal;
# the rendered text is unchanged.
plt.xlabel('Principal Components')
plt.ylabel(r'Proportion of $\sigma^2$ explained')

Text(0, 0.5, 'Proportion of $\\sigma^2$ explained')

In [27]:
# Raw string avoids the invalid '\s' escape warning; the rendered title is unchanged.
plt.title(r'Scree Plot & Cumulative $\sigma^2$', fontsize=20)

Text(0.5, 1.0, 'Scree Plot & Cumulative $\\sigma^2$')

In [28]:
# Grab the current axes and add a twin y-axis sharing its x-axis, so the cumulative
# curve can use its own scale. Note: this rebinds `ax`, which an earlier cell used
# for the PC1/PC2 scatter axes.
ax = plt.gca()
ax2 = ax.twinx()

In [29]:
# Cumulative share of explained variance, drawn in red on the twin axis.
# Raw string avoids the invalid '\s' escape warning; the rendered label is unchanged.
ax2.plot(x, np.cumsum(pca.explained_variance_ratio_),
         'r-', label=r'cumulative $\sigma^2$')

[<matplotlib.lines.Line2D at 0x222ba8bd448>]

In [30]:
# Label the twin axis. Raw string avoids the invalid '\s' escape warning, and the
# label is set on ax2 directly instead of going through its `.axes` attribute.
ax2.set_ylabel(r'Cumulative $\sigma^2$')

Text(0, 0.5, 'Cumulative $\\sigma^2$')

In [31]:
# Fix the cumulative axis to the full 0-1 range.
ax2.set_ylim(0, 1)

(0.0, 1.0)

In [32]:
# Dashed green reference line at a cumulative explained-variance share of 0.7.
# Raw string avoids the invalid '\s' escape warning; the rendered label is unchanged.
ax2.plot(x, [0.7] * len(x), 'g--', label=r'Threshold $\sigma^2 = 0.7$')

[<matplotlib.lines.Line2D at 0x222ba8d7488>]

In [33]:
# Legend for the labelled lines drawn on the twin axis (cumulative curve and threshold).
ax2.legend(loc='upper right')


<matplotlib.legend.Legend at 0x222ba900b88>

In [34]:
# Re-display the current figure now that the cumulative curve, threshold and legend have been added.
plt.gcf()

<IPython.core.display.Javascript object>

In [35]:
# Variance captured by each fitted component (absolute values, not the ratios used in the scree plot).
pca.explained_variance_

array([13.30499079,  5.7013746 ,  2.82291016,  1.98412752,  1.65163324,
        1.20948224])

In [36]:
# Loadings table: one row per original feature, one column per principal component.
imp_features = pd.DataFrame(
    pca.components_,
    index=pc_df.columns[:-1],
    columns=df.columns[:-1],
).T
imp_features

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6
mean radius,0.218902,-0.233857,-0.008531,0.041409,0.037786,0.018741
mean texture,0.103725,-0.059706,0.06455,-0.60305,-0.049469,-0.032179
mean perimeter,0.227537,-0.215181,-0.009314,0.041983,0.037375,0.017308
mean area,0.220995,-0.231077,0.0287,0.053434,0.010331,-0.001888
mean smoothness,0.14259,0.186113,-0.104292,0.159383,-0.365089,-0.286375
mean compactness,0.239285,0.151892,-0.074092,0.031795,0.011704,-0.014131
mean concavity,0.2584,0.060165,0.002734,0.019123,0.086375,-0.009344
mean concave points,0.260854,-0.034768,-0.025564,0.065336,-0.043861,-0.05205
mean symmetry,0.138167,0.190349,-0.04024,0.067125,-0.305941,0.356458
mean fractal dimension,0.064363,0.366575,-0.022574,0.048587,-0.044424,-0.119431


In [37]:
# Rank features by their PC1 loading, largest first; PC2 and PC3 only break ties.
# NOTE(review): the sort uses signed loadings, so a large negative loading ranks last —
# confirm whether ranking by absolute value was intended.
imp_features.sort_values(by=['PC1','PC2','PC3'],ascending=False)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6
mean concave points,0.260854,-0.034768,-0.025564,0.065336,-0.043861,-0.05205
mean concavity,0.2584,0.060165,0.002734,0.019123,0.086375,-0.009344
worst concave points,0.250886,-0.008257,-0.170344,0.006007,0.043332,-0.030873
mean compactness,0.239285,0.151892,-0.074092,0.031795,0.011704,-0.014131
worst perimeter,0.23664,-0.199878,-0.048547,0.013803,0.007454,0.008501
worst concavity,0.228768,0.097964,-0.173057,-0.073951,0.188519,0.028379
worst radius,0.227997,-0.219866,-0.047507,0.015417,-0.004406,-0.000291
mean perimeter,0.227537,-0.215181,-0.009314,0.041983,0.037375,0.017308
worst area,0.224871,-0.219352,-0.011902,0.025895,-0.027391,-0.025165
mean area,0.220995,-0.231077,0.0287,0.053434,0.010331,-0.001888
