In [None]:
"""
Principal Component Analysis (PCA) Implementation
Medium
Machine Learning

Write a Python function that performs Principal Component Analysis (PCA) from scratch. The function should take a 2D NumPy array as input, where each row represents a data sample and each column represents a feature. The function should standardize the dataset, compute the covariance matrix, find the eigenvalues and eigenvectors, and return the principal components (the eigenvectors corresponding to the largest eigenvalues). The function should also take an integer k as input, representing the number of principal components to return.

Example:
Input:
data = np.array([[1, 2], [3, 4], [5, 6]]), k = 1
Output:
[[0.7071], [0.7071]]
Reasoning:
After the data is standardized and its covariance matrix is computed, the eigenvalues and eigenvectors of that matrix are calculated. The eigenvector corresponding to the largest eigenvalue is returned as the principal component, with its entries rounded to four decimal places.
"""

In [1]:
import numpy as np

def pca(data: np.ndarray, k: int) -> np.ndarray:
    """Return the top ``k`` principal directions of ``data``.

    The columns of ``data`` are standardized (zero mean, unit population
    standard deviation), the covariance matrix of the standardized data is
    eigendecomposed, and the eigenvectors belonging to the ``k`` largest
    eigenvalues are returned.

    Args:
        data: 2D array-like of shape ``(n_samples, n_features)``; each row is
            a sample and each column a feature.
        k: Number of principal components to return, with
            ``0 <= k <= n_features``.

    Returns:
        Array of shape ``(n_features, k)``. Column ``j`` is the unit-length
        eigenvector for the ``j``-th largest eigenvalue, with entries rounded
        to four decimal places. Each column is oriented so that its entry of
        largest magnitude is positive, because an eigenvector's sign is
        otherwise arbitrary.

    Raises:
        ValueError: If ``data`` is not 2D, has fewer than two samples or no
            features, contains NaN/inf, or if ``k`` is out of range.
    """
    samples = np.asarray(data, dtype=float)
    if samples.ndim != 2:
        raise ValueError(f"data must be 2D, got {samples.ndim}D")

    n_samples, n_features = samples.shape
    if n_samples < 2 or n_features < 1:
        raise ValueError(
            "data must contain at least two samples and one feature, "
            f"got shape {samples.shape}"
        )
    if not np.isfinite(samples).all():
        raise ValueError("data must not contain NaN or infinite values")
    if not 0 <= k <= n_features:
        raise ValueError(f"k must be between 0 and {n_features}, got {k}")

    centered = samples - samples.mean(axis=0)
    scale = samples.std(axis=0, ddof=0)

    # A constant feature has zero spread; dividing by its std would yield NaN.
    # Detect it exactly via min == max and leave it as an all-zero column.
    constant = samples.max(axis=0) == samples.min(axis=0)
    centered[:, constant] = 0.0
    scale[constant] = 1.0
    standardized = centered / scale

    # np.cov collapses a single feature to a 0-d array; eigh needs a matrix.
    cov_matrix = np.atleast_2d(np.cov(standardized, rowvar=False))

    # The covariance matrix is symmetric, so use eigh: it guarantees real
    # eigenvalues and orthonormal eigenvectors, which general-purpose eig
    # does not.
    eigvals, eigvecs = np.linalg.eigh(cov_matrix)
    order = np.argsort(eigvals)[::-1]
    principal_components = eigvecs[:, order[:k]]

    # Fix the sign of each component: make its largest-magnitude entry positive.
    pivot_rows = np.argmax(np.abs(principal_components), axis=0)
    pivot_signs = np.sign(
        principal_components[pivot_rows, np.arange(principal_components.shape[1])]
    )
    pivot_signs[pivot_signs == 0] = 1.0
    principal_components = principal_components * pivot_signs

    # Adding 0.0 turns any -0.0 left over from rounding/sign flips into 0.0.
    return np.round(principal_components, 4) + 0.0

In [2]:
# Worked example from the problem statement: the two features are perfectly
# correlated (second column = first column + 1), so one component is requested.
k = 1
data = np.array([[1, 2], [3, 4], [5, 6]])
principal_components = pca(data=data, k=k)
print(principal_components)

[[0.7071]
 [0.7071]]
