# PCA Step-by-Step Practical Example

This notebook demonstrates **Principal Component Analysis (PCA)** step by step on the Iris dataset (using the first three numeric features).  
We will cover **every PCA step in detail**: data preprocessing, the covariance matrix, eigendecomposition, explained variance, projection, and reconstruction.

---


In [None]:

from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

# Load the Iris dataset and keep only its leading feature columns
# (three of them, for easier visualization).
iris = load_iris()
n_features = 3
feature_names = iris.feature_names[:n_features]
X = iris.data[:, :n_features]

# Wrap the selected columns in a labelled DataFrame and preview the first rows.
df = pd.DataFrame(data=X, columns=feature_names)
df.head()


In [None]:

# Step 1: Data Preprocessing
# Work on the raw NumPy array behind the DataFrame.
X = df.to_numpy()

# Centering: subtract each column's mean so every feature has zero mean.
mean_vec = X.mean(axis=0)
X_centered = X - mean_vec

# Optional scaling (standardization): divide by the per-column sample
# standard deviation (ddof=1). The later cells shown in this notebook use
# X_centered rather than X_scaled.
std_vec = X.std(axis=0, ddof=1)
X_scaled = X_centered / std_vec

print("Mean vector:", mean_vec)
print("Standard deviations:", std_vec)


In [None]:

# Step 2: Covariance Matrix
# rowvar=False tells np.cov that each column is a variable (feature) and each
# row an observation, so no explicit transpose is needed.
cov_matrix = np.cov(X_centered, rowvar=False)
print("Covariance matrix:\n", cov_matrix)


In [None]:

# Step 3: Eigen Decomposition
# eigh is the solver for symmetric matrices, which a covariance matrix is.
raw_vals, raw_vecs = np.linalg.eigh(cov_matrix)

# Reorder so the largest eigenvalue (and its eigenvector column) comes first.
idx = np.argsort(raw_vals)[::-1]
eig_vals, eig_vecs = raw_vals[idx], raw_vecs[:, idx]

print("Eigenvalues:", eig_vals)
print("Eigenvectors (columns are PCs):\n", eig_vecs)


In [None]:

# Step 4: Explained Variance
# Each eigenvalue's share of the eigenvalue total, and the running sum of
# those shares over the (already descending-sorted) components.
explained_var_ratio = eig_vals / np.sum(eig_vals)
cum_explained = np.cumsum(explained_var_ratio)

print("Explained variance ratio:", explained_var_ratio)
print("Cumulative explained variance:", cum_explained)

# Plot cumulative explained variance against the number of components kept.
component_counts = np.arange(1, len(eig_vals) + 1)
fig, ax = plt.subplots()
ax.plot(component_counts, cum_explained, marker='o')
ax.axhline(0.95, color='r', linestyle='--', label='95% threshold')
# Component counts are integers; pin the ticks so the axis does not show
# fractional values such as 1.5.
ax.set_xticks(component_counts)
ax.set_xlabel("Number of Components")
ax.set_ylabel("Cumulative Explained Variance")
ax.set_title("Cumulative Explained Variance by Number of Components")
ax.legend()
plt.show()


In [None]:

# Step 5: Projection onto top-k PCs (choose k=2 for 95%+ variance)
k = 2
# W holds the first k eigenvector columns; multiplying the centered data by W
# gives each sample's coordinates along those k components.
W = eig_vecs[:, :k]
X_projected = X_centered @ W

print("Shape of projected data:", X_projected.shape)

# Scatter the two component scores, colored by iris.target.
fig, ax = plt.subplots()
pc1_scores = X_projected[:, 0]
pc2_scores = X_projected[:, 1]
ax.scatter(pc1_scores, pc2_scores, c=iris.target, cmap='viridis')
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("Projection onto first 2 Principal Components")
plt.show()


In [None]:

# Step 6: Reconstruction from reduced space
# Map the k-dimensional scores back into feature space with W.T, then undo the
# centering from Step 1 by adding the mean vector back.
X_reconstructed = X_projected @ W.T + mean_vec

# Reconstruction error (MSE): mean of the squared element-wise differences
# between the original data and its reconstruction.
mse = np.mean((X - X_reconstructed) ** 2)
# Report the k actually used so the label stays correct if k is changed.
print(f"Reconstruction MSE with k={k}:", mse)


In [None]:

# Save projected data & reconstruction
output_file = "/mnt/data/pca_projection_and_reconstruction.csv"

# to_csv does not create missing directories, so make sure the target
# directory exists before writing.
output_path = Path(output_file)
output_path.parent.mkdir(parents=True, exist_ok=True)

# Projection scores and class labels.
results = pd.DataFrame({
    "PC1": X_projected[:, 0],
    "PC2": X_projected[:, 1],
    "Target": iris.target
})

# Append one column per reconstructed feature so the file actually contains
# the reconstruction its name and this cell's heading promise.
for col_idx, feature in enumerate(df.columns):
    results[f"{feature} (reconstructed)"] = X_reconstructed[:, col_idx]

results.to_csv(output_path, index=False)

print("Saved PCA results to:", output_file)
