<a href="https://colab.research.google.com/github/tselane2110/ml-algorithms-implemented/blob/main/PCA_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementing Principal Component Analysis (PCA)

The steps involved are:

1. Convert the given df into a numpy array (matrix).
2. Perform mean-centralization on the resultant matrix.
3. Create a covariance matrix
4. Calculate the eigenvalue(s) "lambda" by solving the characteristic equation:
                      det(A - (lambda * I)) = 0
5. Sort the eigenvalues in descending order. Choose the top 'z' eigenvalues, where 'z' is the number of reduced dimensions (the number of dimensions you want to reduce your initial dataset/dataframe to).
6. Find the eigenvector(s) for the chosen 'z' eigenvalues by solving:
                      A * V = lambda * V
                 where :
                       A -> covariance matrix
                       V -> eigenvector to find
                  lambda -> eigenvalue for the eigenvector V

7. Find the projected matrix Ap as:
                      Ap = X*V
                  where :
                       X -> mean-centered data matrix (from step 2)
                       V -> [V1/||V1|| V2/||V2|| ... Vz/||Vz||]
               and ||V|| -> L2-norm of V
            L2-norm of V -> sqrt(v1^2 + v2^2 + ... + vn^2)
8. Convert Ap to a DataFrame (one column per principal component)
9. Return the resultant DataFrame




In [None]:
import numpy
import numpy as np
import pandas as pd
import sympy as sp              # Symbolic calculation (using sympy for readability)
from numpy.linalg import det

def mean_centralize(X):
  """
  Subtract the mean taken along axis 0 from X.

  X -> numpy array to centre
  return -> new array holding X minus its axis-0 mean; X itself is left untouched
  """
  # Out-of-place subtraction on purpose: `X -= mean` would overwrite the
  # caller's array and raises a casting error when X has an integer dtype
  # (the mean is a float).
  return X - X.mean(axis=0)

def get_eigenValues(A):
  """
  Find the eigenvalues of A by solving its characteristic polynomial symbolically.

  A -> square numpy array (anything with `.shape` that `sp.Matrix` accepts)
  return -> list of the roots of det(A - λI) as plain Python numbers:
            floats for real roots, complex only when the imaginary part
            is not negligible

  The characteristic polynomial and the eigenvalues are printed.
  NOTE(review): `sp.solve` is expected to list each distinct root once, so a
  repeated eigenvalue would appear only once in the result - confirm if
  multiplicity matters to the caller.
  """
  λ = sp.symbols('λ')
  A_sym = sp.Matrix(A)
  char_poly = (A_sym - λ*sp.eye(A.shape[0])).det()
  print("Characteristic polynomial:", char_poly)

  # Solve for eigenvalues
  eigenvalues = []
  for root in sp.solve(char_poly, λ):
    value = complex(sp.N(root))
    # Closed-form root formulas evaluated in floating point can leave a tiny
    # imaginary residue on roots that are really real; strip it so the
    # values can be sorted and used in numpy arithmetic.
    if abs(value.imag) <= 1e-8 * max(1.0, abs(value.real)):
      eigenvalues.append(value.real)
    else:
      eigenvalues.append(value)
  print("Eigenvalues:", eigenvalues)

  return eigenvalues

def _real_part(value):
  """Return `value` as a plain float, dropping any imaginary component.

  Used on eigenvalues that may arrive as symbolic numbers; the imaginary part
  is assumed to be round-off only (the matrices handled here are covariance
  matrices).
  """
  return complex(value).real


def calculate_eigenvectors(eigenvalues, A=None):
  """
  Find one unit-length eigenvector of A for every given eigenvalue.

  eigenvalues -> iterable of eigenvalues of A
  A -> square matrix the eigenvalues belong to (required; the default only
       exists so that a missing argument produces a clear error)
  return -> numpy array of shape (len(eigenvalues), A.shape[0]); row i is the
            eigenvector for eigenvalues[i]

  NOTE: the sign of each eigenvector is arbitrary, and a repeated eigenvalue
  yields the same vector each time it is passed.
  """
  if A is None:
    raise ValueError("calculate_eigenvectors needs the matrix A the eigenvalues belong to")

  A = np.asarray(A, dtype=float)
  identity = np.eye(A.shape[0])  # loop-invariant, built once

  eigenvectors = []
  for lam in eigenvalues:
    # Solve (A - λI)v = 0
    M = A - _real_part(lam) * identity

    # Use SVD to find null space
    _, _, vh = np.linalg.svd(M)
    v = vh[-1]  # last row of V^H pairs with the smallest singular value
    v = v / np.linalg.norm(v)  # normalize
    eigenvectors.append(v)

  # reshape keeps the (k, d) layout even when no eigenvalues were given
  return np.array(eigenvectors).reshape(-1, A.shape[0])


def PCA(df, n):
  """
  df -> input dataframe (numeric columns)
  n -> number of dimensions to reduce into (1 <= n <= number of columns)
  return -> reduced dataframe with columns PC1, PC2, ...

  Raises ValueError when n is outside the valid range.
  NOTE(review): if fewer than n eigenvalues are found, the result has fewer
  than n columns.
  """
  # converting df to a float numpy array; copy so that centring can never
  # write back into the caller's dataframe
  X = df.to_numpy(dtype=float, copy=True)

  n_features = X.shape[1]
  if not 1 <= n <= n_features:
    raise ValueError(f"n must be between 1 and {n_features}, got {n}")

  # mean-centralizing X
  X_centered = mean_centralize(X)

  # getting the covariance matrix (columns are the variables);
  # atleast_2d because np.cov returns a 0-d array for a single column
  cov_matrix = np.atleast_2d(np.cov(X_centered, rowvar=False))

  # calculating eigenvalues and sorting them in descending order as floats
  eigenvalues = sorted(
    (_real_part(ev) for ev in get_eigenValues(cov_matrix)),
    reverse=True,
  )

  # choosing 'n' number of highest eigenvalues to find their respective eigenvectors
  chosen_eigenvalues = eigenvalues[:n]

  # finding the eigenvectors for the chosen_eigenvalues (one per row)
  eigenvectors = calculate_eigenvectors(chosen_eigenvalues, cov_matrix)

  # Put the eigenvectors in columns and normalize each one (L2 norm)
  V = eigenvectors.T
  V = V / np.linalg.norm(V, axis=0)

  # calculating the projected matrix
  Ap = X_centered @ V

  # converting Ap to a df again
  Ap_df = pd.DataFrame(Ap, columns=[f'PC{i+1}' for i in range(Ap.shape[1])])

  # returning the result
  return Ap_df



