In [1]:
import numpy as np

**STEPS**

Standardize the Dataset: Ensure that each feature has a mean of 0 and a standard deviation of 1.

Compute the Covariance Matrix: Reflects how features vary together.

Find Eigenvalues and Eigenvectors: Solve the characteristic equation for the covariance matrix.

Select Principal Components: Choose eigenvectors (components) with the highest eigenvalues for dimensionality reduction.

In [2]:
data = np.array([[1, 2], [3, 4], [5, 6]])
k = 2

In [3]:
# standardise the dataset

standardised_data = (data - np.mean(data, axis=0))/np.std(data, axis=0)

In [4]:
standardised_data

array([[-1.22474487, -1.22474487],
       [ 0.        ,  0.        ],
       [ 1.22474487,  1.22474487]])

In [5]:
# compute covariance matrix

covariance_matrix = np.cov(standardised_data.T)
print(covariance_matrix)

[[1.5 1.5]
 [1.5 1.5]]


In [9]:
# OR use this

covariance_matrix = np.cov(standardised_data, rowvar=False)
covariance_matrix

array([[1.5, 1.5],
       [1.5, 1.5]])

In [10]:
# compute eigenvalues and eigenvectors

eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
print(eigenvalues)
print(eigenvectors)

[3. 0.]
[[ 0.70710678 -0.70710678]
 [ 0.70710678  0.70710678]]


In [11]:
## sort eigenvalues and eigenvectors

sorted_indices = np.argsort(eigenvalues)[::-1]
sorted_eigenvalues = eigenvalues[sorted_indices]
sorted_eigenvectors = eigenvectors[:, sorted_indices]
print(sorted_eigenvalues)
print(sorted_eigenvectors)

[3. 0.]
[[ 0.70710678 -0.70710678]
 [ 0.70710678  0.70710678]]


In [21]:
np.argsort(eigenvalues)[::-1]

array([0, 1], dtype=int64)

In [12]:
# compute principal components

principal_components = sorted_eigenvectors[:, :k]
print(principal_components)

[[ 0.70710678 -0.70710678]
 [ 0.70710678  0.70710678]]


In [13]:
# project data onto principal components

projected_data = standardised_data.dot(principal_components)
print(projected_data)

[[-1.73205081  0.        ]
 [ 0.          0.        ]
 [ 1.73205081  0.        ]]


In [14]:
# reconstruct original data

reconstructed_data = projected_data.dot(principal_components.T) + np.mean(data, axis=0)
print(reconstructed_data)

[[1.77525513 2.77525513]
 [3.         4.        ]
 [4.22474487 5.22474487]]


In [17]:
# putting all of this in a function

def pca_implemented(data, k):
    standardised_data = (data - np.mean(data, axis=0))/np.std(data, axis=0)
    covariance_matrix = np.cov(standardised_data, rowvar=False)
    eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
    sorted_indices = np.argsort(eigenvalues)[::-1]
    sorted_eigenvalues = eigenvalues[sorted_indices]
    sorted_eigenvectors = eigenvectors[:, sorted_indices]
    principal_components = sorted_eigenvectors[:, :k]
    
    return np.round(principal_components, 4)
    

In [18]:
pca_implemented(data, k)

array([[ 0.7071, -0.7071],
       [ 0.7071,  0.7071]])