In [5]:

import pandas as pd

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# The CSV has no header row, so the column labels are supplied explicitly.
df = pd.read_csv(
    url,
    header=None,
    names=['sepal length', 'sepal width', 'petal length', 'petal width', 'target'],
)

# Preview the first five rows of the loaded frame.
df.head()


Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:


# Standardize the data
from sklearn.preprocessing import StandardScaler

features = ['sepal length', 'sepal width', 'petal length', 'petal width']

# Pull the measurements and the species labels out as NumPy arrays.
x = df[features].values
y = df['target'].values

# Rescale each feature column to zero mean and unit variance.
x = StandardScaler().fit(x).transform(x)

# Summary statistics to confirm the rescaling (mean ~ 0, std ~ 1 per column).
pd.DataFrame(x).describe()




Unnamed: 0,0,1,2,3
count,150.0,150.0,150.0,150.0
mean,-2.775558e-16,-5.140333e-16,1.154632e-16,9.251859e-16
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.438987,-1.568735,-1.44445
25%,-0.9006812,-0.5877635,-1.227541,-1.181504
50%,-0.05250608,-0.1249576,0.3362659,0.1332259
75%,0.6745011,0.5692513,0.7627586,0.7905908
max,2.492019,3.114684,1.786341,1.710902


In [7]:
# Sanity check: standardization must not change the (rows, features) shape.
x.shape


(150, 4)

In [8]:

from sklearn.decomposition import PCA

# Project the standardized features onto their first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)

# Wrap the projected coordinates in a labelled frame for later joining.
principalDF = pd.DataFrame(
    principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

principalDF.shape



(150, 2)

In [9]:

# Attach the species label to the two principal-component columns.
# Both frames carry the default 0..n-1 row index, so the column-wise concat
# lines the rows up one-to-one.
finalDf = pd.concat([principalDF, df[['target']]], axis=1)

# Preview only the first rows instead of rendering the whole frame.
finalDf.head(10)


Unnamed: 0,principal component 1,principal component 2,target
0,-2.264542,0.505704,Iris-setosa
1,-2.086426,-0.655405,Iris-setosa
2,-2.367950,-0.318477,Iris-setosa
3,-2.304197,-0.575368,Iris-setosa
4,-2.388777,0.674767,Iris-setosa
5,-2.070537,1.518549,Iris-setosa
6,-2.445711,0.074563,Iris-setosa
7,-2.233842,0.247614,Iris-setosa
8,-2.341958,-1.095146,Iris-setosa
9,-2.188676,-0.448629,Iris-setosa


In [10]:


# The explained variance tells how much information (variance) can be
# attributed to each principal component. Reducing four dimensions to two
# necessarily discards some of the variance.
# The `explained_variance_ratio_` attribute shows that the first principal
# component holds 72.77% of the variance and the second holds 23.03%;
# together the two components retain 95.80% of the information.


pca.explained_variance_ratio_



array([ 0.72770452,  0.23030523])

In [11]:

# Total share of the variance retained by the two fitted components.
pca.explained_variance_ratio_.sum()


0.95800975361481988

In [12]:


# Example: PCA step by step, starting again from the raw file

import pandas as pd

# Read the header-less Iris CSV and label its five columns in a single call.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    header=None,
    sep=',',
    names=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class'],
)


In [13]:
# Drop any row in which every field is missing (guards against an empty
# line at the end of the file). The result is reassigned rather than using
# inplace=True, so the previously loaded frame object is never mutated.
df = df.dropna(how="all")

# Count the remaining missing values per column.
df.isnull().sum()

sepal_len    0
sepal_wid    0
petal_len    0
petal_wid    0
class        0
dtype: int64

In [14]:


# Separate the frame into the feature matrix X (every column except the
# last) and the class-label vector y (the last column), both as ndarrays.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values


In [15]:

from sklearn.preprocessing import StandardScaler

# Rescale every feature column of X to zero mean and unit variance.
X_std = StandardScaler().fit(X).transform(X)

X_std.shape

(150, 4)

In [16]:

# Eigendecomposition
# Computing Eigenvectors and Eigenvalues

# Covariance Matrix
# The classic approach to PCA is to perform the eigendecomposition on the covariance matrix


In [17]:

import numpy as np

# Sample covariance computed by hand: centre the columns, then
# (Xc^T . Xc) / (n - 1).
mean_vec = np.mean(X_std, axis=0)
centered = X_std - mean_vec
n_samples = X_std.shape[0]
cov_mat = centered.T.dot(centered) / (n_samples - 1)

print('Covariance matrix \n%s' % (cov_mat,))



Covariance matrix 
[[ 1.00671141 -0.11010327  0.87760486  0.82344326]
 [-0.11010327  1.00671141 -0.42333835 -0.358937  ]
 [ 0.87760486 -0.42333835  1.00671141  0.96921855]
 [ 0.82344326 -0.358937    0.96921855  1.00671141]]


In [18]:


# Another approach is simply to use NumPy's cov function; rowvar=False
# tells it that the variables are stored in columns.
print('Numpy Covariance Matrix: \n%s' % (np.cov(X_std, rowvar=False),))




Numpy Covariance Matrix: 
[[ 1.00671141 -0.11010327  0.87760486  0.82344326]
 [-0.11010327  1.00671141 -0.42333835 -0.358937  ]
 [ 0.87760486 -0.42333835  1.00671141  0.96921855]
 [ 0.82344326 -0.358937    0.96921855  1.00671141]]


In [19]:

# Next, perform an eigendecomposition of the covariance matrix of the
# standardized data. Each column of eig_vecs is one eigenvector.
cov_mat = np.cov(X_std, rowvar=False)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

print('Eigenvectors \n%s' % (eig_vecs,))
print('\nEigenvalues \n%s' % (eig_vals,))



Eigenvectors 
[[ 0.52237162 -0.37231836 -0.72101681  0.26199559]
 [-0.26335492 -0.92555649  0.24203288 -0.12413481]
 [ 0.58125401 -0.02109478  0.14089226 -0.80115427]
 [ 0.56561105 -0.06541577  0.6338014   0.52354627]]

Eigenvalues 
[ 2.93035378  0.92740362  0.14834223  0.02074601]


In [20]:

# Correlation Matrix

# Eigendecomposition of the standardized data based on the correlation
# matrix. Note: this overwrites eig_vals / eig_vecs from the covariance step.
cor_mat1 = np.corrcoef(X_std, rowvar=False)
eig_vals, eig_vecs = np.linalg.eig(cor_mat1)

print('Eigenvectors \n%s' % (eig_vecs,))
print('\nEigenvalues \n%s' % (eig_vals,))




Eigenvectors 
[[ 0.52237162 -0.37231836 -0.72101681  0.26199559]
 [-0.26335492 -0.92555649  0.24203288 -0.12413481]
 [ 0.58125401 -0.02109478  0.14089226 -0.80115427]
 [ 0.56561105 -0.06541577  0.6338014   0.52354627]]

Eigenvalues 
[ 2.91081808  0.92122093  0.14735328  0.02060771]


In [21]:

# Eigendecomposition of the raw (unstandardized) data based on the
# correlation matrix. The eig_vals / eig_vecs assigned here are the ones
# the following cells work with.
cor_mat2 = np.corrcoef(X, rowvar=False)
eig_vals, eig_vecs = np.linalg.eig(cor_mat2)

print('Eigenvectors \n%s' % (eig_vecs,))
print('\nEigenvalues \n%s' % (eig_vals,))


Eigenvectors 
[[ 0.52237162 -0.37231836 -0.72101681  0.26199559]
 [-0.26335492 -0.92555649  0.24203288 -0.12413481]
 [ 0.58125401 -0.02109478  0.14089226 -0.80115427]
 [ 0.56561105 -0.06541577  0.6338014   0.52354627]]

Eigenvalues 
[ 2.91081808  0.92122093  0.14735328  0.02060771]


In [22]:


# Make a list of (eigenvalue, eigenvector) tuples. Eigenvectors are the
# columns of eig_vecs, hence the transpose when pairing them up.
eig_pairs = list(zip(np.abs(eig_vals), eig_vecs.T))

# Sort the (eigenvalue, eigenvector) tuples from high to low.
# The key compares eigenvalues only: a plain tuple sort would fall through to
# comparing the ndarray eigenvectors whenever two eigenvalues tie, and that
# comparison raises "truth value of an array ... is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)


In [23]:


# List the eigenvalues in the order established by the sorted pairs.
print('Eigenvalues in descending order:')
for eig_val, eig_vec in eig_pairs:
    print(eig_val)



Eigenvalues in descending order:
2.91081808375
0.921220930707
0.147353278305
0.0206077072356


In [24]:

# Percentage of the total variance carried by each component (largest
# first), followed by the running total across components.
tot = sum(eig_vals)
eig_vals_desc = sorted(eig_vals, reverse=True)
var_exp = [(eig_val / tot) * 100 for eig_val in eig_vals_desc]
cum_var_exp = np.cumsum(var_exp)

print(cum_var_exp)



[  72.77045209   95.80097536   99.48480732  100.        ]


In [28]:

import numpy as np

# Projection matrix W: the eigenvectors belonging to the two largest
# eigenvalues (eig_pairs is sorted high to low), placed side by side as
# columns. reshape(-1, 1) infers the vector length, so this works for any
# number of input features rather than exactly four.
matrix_W = np.hstack(
    [eig_vec.reshape(-1, 1) for _eig_val, eig_vec in eig_pairs[:2]]
)

print('Matrix W:\n', matrix_W)



Matrix W:
 [[ 0.52237162 -0.37231836]
 [-0.26335492 -0.92555649]
 [ 0.58125401 -0.02109478]
 [ 0.56561105 -0.06541577]]


In [38]:

from sklearn.decomposition import PCA as sklearnPCA

# Shortcut: let scikit-learn fit a two-component PCA on the standardized
# data and return the projected samples.
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)

print(Y_sklearn.shape)


(150, 2)
