<a href="https://colab.research.google.com/github/sushantnair/Principal_Component_Analysis/blob/main/EXPT6_PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

In [2]:
# Step 1: the dataset. Each key is a feature name, each list holds that feature's samples.
# Alternative dataset with four features and five samples each:
# features = {'x': [4, 8, 13, 7, 8], 'y': [11, 4, 5, 14, 15], 'z': [1, 2, 3, 5, 6], 'a': [7, 8, 70, 80, 9]}
features = {'x': [4, 8, 13, 7], 'y': [11, 4, 5, 14]}
n = len(features)  # number of features (dimensions)
# Number of samples, read from the first feature instead of a hard-coded 'x' key
# so the cell keeps working when the features are renamed.
N = len(next(iter(features.values())))
assert all(len(values) == N for values in features.values()), \
    'every feature must have the same number of values'

# Build the column header from N so it also fits datasets that do not have exactly four samples.
header = 'Feat ' + ' '.join(f'Val{i}' for i in range(1, N + 1))
print(f'Principal Component Analysis (PCA)\nStep 1: Dataset\n{header}')
for feature, values in features.items():
    print(feature, *["{:>4}".format(val) for val in values])

Principal Component Analysis (PCA)
Step 1: Dataset
Feat Val1 Val2 Val3 Val4
x    4    8   13    7
y   11    4    5   14


In [3]:
print(f'Step 2: Calculate Mean')
# Arithmetic mean of every feature, stored under the key '<feature>_bar'.
# The built-in sum() is called directly; binding a variable named `sum` here
# would shadow the built-in for every cell that runs afterwards in this kernel.
mean_dict = {}
for feature, values in features.items():
    mean = sum(values) / len(values)
    mean_dict[f'{feature}_bar'] = mean

print(mean_dict)

Step 2: Calculate Mean
{'x_bar': 8.0, 'y_bar': 8.5}


In [4]:
print(f'Step 3: Covariance Matrix')
print(f'{n} features => {n} by {n} matrix')

# Label matrix: cell (i, j) holds the pair of feature names whose covariance
# belongs at position (i, j) of the numeric matrix built below.
cov_matrix = np.empty((n,n), dtype=object)
for i, feature_i in enumerate(features):
    for j, feature_j in enumerate(features):
        cov_matrix[i,j] = (feature_i, feature_j)

print(cov_matrix)

# Sample covariance of each pair: the sum over all N samples of the product of
# the two features' deviations from their means, divided by (N - 1).
cov_mat = np.empty((n,n))
for i in range(cov_matrix.shape[0]):
    for j in range(cov_matrix.shape[1]):
        print(cov_matrix[i,j])
        # Look the features up by their actual keys (no str() round-trip), so
        # non-string feature names work as well.
        name_a, name_b = cov_matrix[i,j]
        mean_a = mean_dict[f'{name_a}_bar']
        mean_b = mean_dict[f'{name_b}_bar']
        cov_val = 0
        for k in range(N):
            cov_val = cov_val + (features[name_a][k] - mean_a) * (features[name_b][k] - mean_b)

        cov_mat[i,j] = cov_val / (N - 1)

print(cov_mat)


Step 3: Covariance Matrix
2 features => 2 by 2 matrix
[[('x', 'x') ('x', 'y')]
 [('y', 'x') ('y', 'y')]]
('x', 'x')
('x', 'y')
('y', 'x')
('y', 'y')
[[ 14. -11.]
 [-11.  23.]]


In [5]:
# Sanity check on the covariance matrix: report any NaN or infinite entries
# before it is handed to the eigen-decomposition. Prints nothing when clean.
for predicate, message in (
    (np.isnan, "Array contains NaN values"),
    (np.isinf, "Array contains inf values"),
):
    if np.any(predicate(cov_mat)):
        print(message)

In [6]:
print(f'Step 4: Eigenvalue; Eigenvector; Normalized Eigenvector')
print(f'Step 4a: Eigen value for v = 1')
print(cov_mat)

# Eigen-decomposition of the covariance matrix. Column k of `eigenvectors`
# is the eigenvector that belongs to eigenvalues[k].
eigenvalues, eigenvectors = np.linalg.eig(cov_mat)
print(f'Eigenvalues are: {eigenvalues}')

# Position of the largest eigenvalue.
max_egnvalindx = eigenvalues.argmax()

# Largest eigenvalue and the eigenvector column paired with it.
max_eigenvalue, max_eigenvector = eigenvalues[max_egnvalindx], eigenvectors[:, max_egnvalindx]

print(f'Selected Eigenvalue: {max_eigenvalue}')
print(f'Corresponding Normalized Eigenvector: {max_eigenvector}')

Step 4: Eigenvalue; Eigenvector; Normalized Eigenvector
Step 4a: Eigen value for v = 1
[[ 14. -11.]
 [-11.  23.]]
Eigenvalues are: [ 6.61513568 30.38486432]
Selected Eigenvalue: 30.384864324004713
Corresponding Normalized Eigenvector: [ 0.55738997 -0.83025082]


In [7]:
print(f'Step 5: Derive n to 1 using First Principal Component')
# Reduced Dimension p: one projected value per sample, keyed 'p1<sample number>'.
p = dict()

for i in range(N):
    # Sample i with each feature's mean subtracted, in the iteration order of `features`.
    horz_mat = [value[i] - mean_dict[f'{feature}_bar'] for feature, value in features.items()]
    horz_mat = np.array(horz_mat)
    print(horz_mat)
    # The same centred sample as a column vector.
    vert_mat = horz_mat.reshape(-1,1)
    print(vert_mat)
    # Projection onto the selected eigenvector; the dot product is a one-element array.
    p_val = np.dot(max_eigenvector, vert_mat)
    # Convert to a built-in float so the stored value prints as a plain number;
    # NumPy >= 2 would otherwise render each entry as np.float64(...).
    score = float(p_val[0])
    print(score)
    p[f'p1{i+1}'] = round(score, 4)

print(f'Reduced Dimensions: {p}')

Step 5: Derive n to 1 using First Principal Component
[-4.   2.5]
[[-4. ]
 [ 2.5]]
-4.305186922674707
[ 0.  -4.5]
[[ 0. ]
 [-4.5]]
3.7361286866113304
[ 5.  -3.5]
[[ 5. ]
 [-3.5]]
5.692827710560994
[-1.   5.5]
[[-1. ]
 [ 5.5]]
-5.123769474497617
Reduced Dimensions: {'p11': -4.3052, 'p12': 3.7361, 'p13': 5.6928, 'p14': -5.1238}


In [None]:
# references: the different ways of walking the `features` dictionary
# 1) the whole mapping at once
print(features)
# 2) the value lists only
for column in features.values():
    print(column)
# 3) (name, values) pairs
for name, column in features.items():
    print(name, column)
