<a href="https://colab.research.google.com/github/yuvalofek/PCA/blob/main/Testing_PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Performing PCA on random data

Help from:
https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html


In [196]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Generating sample data to perform PCA on
- 20 dim noise with different scaling 

In [197]:
n_samples = 500
dim = 20
n_components = 2

In [198]:
# Genrerate data
noise = np.random.randn(n_samples, dim)
scaling = np.random.randint(1, 100, dim)
input = (noise*scaling)

In [199]:
print(scaling)

[26 84  5 91 51 69 89 91  8 58 39 59 84 79 28 57 13 97 72 68]


In [200]:
## Standardize input:
sc = StandardScaler()
input = sc.fit_transform(input)

## Performing PCA

### Using sklearn

In [201]:
box_pca = PCA(n_components=n_components)
box_pca.fit(input)
print('Explained Variance Percent:', (box_pca.explained_variance_ratio_*100).round(2).tolist())
print('SVs:', box_pca.singular_values_.round(3).tolist())

Explained Variance Percent: [6.85, 6.46]
SVs: [26.167, 25.415]


In [202]:
output_box = box_pca.transform(input)

### By ourselves

In [203]:
# first step is to calculate the covariance matrix 
covar = np.cov(input.T)
e_val, e_vect = np.linalg.eig(covar)

#### Explained variance 

$\frac{\lambda_i} {\sum_j \lambda_j}$

In [204]:
var_explained = e_val/e_val.sum()
print('Explained Variance Percent:', sorted((var_explained*100).round(2), reverse=True))

Explained Variance Percent: [6.85, 6.46, 6.25, 6.01, 5.84, 5.75, 5.51, 5.43, 5.26, 5.09, 4.99, 4.7, 4.54, 4.32, 4.31, 3.97, 3.92, 3.75, 3.61, 3.44]


#### Singular values

$\sigma_i = \sqrt{\lambda_i*(n_{samples}-1)}$

In [205]:
print('SVs:', np.sqrt(e_val*(n_samples-1)).round(3).tolist())

SVs: [26.167, 25.415, 18.536, 24.993, 19.011, 24.522, 19.376, 19.803, 19.926, 24.175, 23.974, 20.76, 20.784, 21.297, 21.68, 23.479, 23.299, 22.928, 22.569, 22.336]


We did it! We match the results from sklearn!!!

In [206]:
# How we could tranform the data
np.dot(input, e_vect.T[:n_components].T)

array([[-2.92044138e-01, -1.26033749e+00],
       [-8.82693807e-01, -1.29739241e+00],
       [ 3.12237625e-01, -4.70932819e-01],
       [ 4.43182706e-01,  5.26715268e-01],
       [-8.45949354e-01, -1.34324394e-01],
       [ 1.44863382e+00,  2.25664816e-01],
       [ 1.42231968e+00, -1.55796691e-01],
       [-2.11761267e-01, -8.49002951e-03],
       [-1.83393464e+00, -1.74384410e+00],
       [-1.37943977e+00, -1.95028504e-01],
       [ 6.03310913e-01,  2.88816515e-02],
       [-6.89545658e-01,  5.55981860e-01],
       [-4.04783480e-01, -1.50806847e+00],
       [-1.29581905e+00, -2.40985018e-01],
       [ 7.97974756e-01,  7.14571838e-03],
       [ 4.83726470e-01, -1.79586161e+00],
       [-1.03997334e+00, -3.46093473e-01],
       [-2.19245384e-01, -2.63675989e-01],
       [-1.38740539e+00, -2.74103312e-01],
       [-1.95217524e-01,  1.00282477e+00],
       [-8.34747539e-01,  6.36996513e-01],
       [ 4.56705423e-01, -2.01919714e+00],
       [ 6.29016412e-01,  2.94365741e-01],
       [ 3.