# In [None]:
# Step 1: Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Step 2: Load the Breast Cancer dataset
# Pull the feature matrix, labels and their human-readable names off the
# object returned by the loader.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
target_names, feature_names = cancer.target_names, cancer.feature_names

# Display dataset information
# One (caption, value) row per summary line keeps the report in one place.
dataset_summary = (
    ("Shape of dataset:", X.shape),
    ("Feature names:", list(feature_names)),
    ("Target classes:", target_names),
)
for caption, value in dataset_summary:
    print(caption, value)

# Step 3: Standardize the data
# Learn the scaling parameters from X first, then apply them to the same
# matrix; the fitted scaler stays available for reuse.
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)

# Step 4: Apply PCA to compute explained variance
# PCA is constructed without n_components so the explained-variance ratios
# of the fitted components can be inspected and accumulated below.
pca = PCA()
X_pca = pca.fit_transform(X_std)

# (a) Variance explained by each component
explained_variance_ratio = pca.explained_variance_ratio_
print("\nExplained variance ratio (per component):\n", explained_variance_ratio)

# (b) Cumulative variance and number of components for the target variance
# Single source of truth for the threshold: the component count, the printed
# message and the reference line in the plot are all derived from it, so they
# cannot drift apart when the value is changed.
VARIANCE_THRESHOLD = 0.90
cumulative_variance = np.cumsum(explained_variance_ratio)
threshold_reached = cumulative_variance >= VARIANCE_THRESHOLD
if threshold_reached.any():
    # argmax on a boolean array yields the index of the first True entry;
    # +1 converts that 0-based index into a component count.
    n_components_90 = int(np.argmax(threshold_reached)) + 1
else:
    # argmax returns 0 for an all-False mask, which would wrongly report a
    # single component (e.g. threshold raised to ~1.0 while rounding keeps
    # the cumulative sum just below it). Fall back to every component.
    n_components_90 = len(cumulative_variance)
print(
    f"\nNumber of components required to explain at least "
    f"{VARIANCE_THRESHOLD:.0%} variance: {n_components_90}"
)

# Step 5: Plot cumulative explained variance
plt.figure(figsize=(8, 5))
component_counts = range(1, len(cumulative_variance) + 1)
plt.plot(
    component_counts,
    cumulative_variance,
    marker='o',
    linestyle='--',
    color='teal',
)
# Horizontal reference line at the same threshold used for the count above.
plt.axhline(
    y=VARIANCE_THRESHOLD,
    color='r',
    linestyle='--',
    label=f'{VARIANCE_THRESHOLD:.0%} Variance Threshold',
)
plt.title("Cumulative Explained Variance (Breast Cancer Dataset)")
plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.legend()
plt.grid(True)
plt.show()

# Step 6: Reduce data to 2 principal components
# The solver is pinned to the exact full SVD. With the default
# svd_solver='auto', some scikit-learn versions may pick the randomized
# solver for a truncated fit of this shape, and no random_state is set here,
# so results could vary slightly between runs. The exact solver keeps this
# projection reproducible.
pca_2d = PCA(n_components=2, svd_solver="full")
X_reduced = pca_2d.fit_transform(X_std)

# Step 7: Visualize PCA results in 2D
plt.figure(figsize=(8, 6))
colors = ['crimson', 'seagreen']

# The integer label of each class is its position in target_names, so
# enumerate supplies it instead of a hand-written [0, 1] list. zip still
# stops at the shorter of colors / target_names.
for i, (color, target_name) in enumerate(zip(colors, target_names)):
    class_mask = y == i  # rows belonging to the current class
    plt.scatter(
        X_reduced[class_mask, 0],
        X_reduced[class_mask, 1],
        color=color,
        lw=2,
        label=target_name,
        alpha=0.7,
    )

plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("PCA on Breast Cancer Dataset (2 Components)")
plt.legend()
plt.show()

# Step 8: Print explained variance for first two components
# Report the per-component ratios of the 2-component fit, then their sum
# expressed as a percentage.
explained_var_2 = pca_2d.explained_variance_ratio_
total_variance_pct = explained_var_2.sum() * 100
print("\nExplained variance ratio (2 components):", explained_var_2)
print(f"Total variance explained by first 2 components: {total_variance_pct:.2f}%")
