# Principal Component Analysis (PCA)

## Iris Dataset

### Dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

### Load the iris dataset

In [None]:
# Load the iris dataset and wrap its feature matrix in a DataFrame whose
# columns carry the feature names.
iris = datasets.load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Last expression of the cell: preview the first five rows.
df.head(5)

### Check the feature correlation

In [None]:
# Pairwise correlation between the feature columns, drawn as an annotated
# heatmap (each cell shows its coefficient).
feature_corr = df.corr()
sns.heatmap(feature_corr, annot=True)
plt.show()

### Check sample separation

In [None]:
# Scatter plot of the first two iris features, one colour per class
x_name, y_name = iris.feature_names[:2]

_, ax = plt.subplots()
scatter = ax.scatter(iris.data[:, 0], iris.data[:, 1], c=iris.target)
ax.set_xlabel(x_name)
ax.set_ylabel(y_name)

# legend_elements() returns (handles, labels); only the handles are needed,
# the class names are used as the legend text instead.
class_handles = scatter.legend_elements()[0]
_ = ax.legend(class_handles, iris.target_names, loc="lower right", title="Classes")

### Apply PCA

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps every component and project the samples onto them.
# `X` holds the projected samples (scores) and is reused by the plots below.
pca = PCA()
X = pca.fit_transform(iris.data)

# Build the component labels from the fitted model rather than hard-coding
# four names, so the plot stays correct if the number of features changes.
pc_labels = [f'PC{i + 1}' for i in range(len(pca.explained_variance_ratio_))]

# scree plot: percentage of the total variance captured by each component
print(pca.explained_variance_ratio_)
plt.bar(x=pc_labels, height=100*pca.explained_variance_ratio_)
plt.title('Scree Plot', fontsize=16)
plt.ylabel('Explained variance, %')
plt.yticks(np.arange(0, 101, 10))
plt.grid(axis='y')
plt.show()

In [None]:
# Scores plot: samples projected onto the first two principal components,
# coloured by class.
fig = plt.figure(1, figsize=(8, 6))
ax = fig.add_subplot(111)

# Keep the handle of the scatter drawn on THIS axes: the legend entries must
# be generated from it, not from a scatter created for an earlier figure.
scatter = ax.scatter(
    X[:, 0],
    X[:, 1],
    c=iris.target,
    s=40,
)
ax.legend(
    scatter.legend_elements()[0], iris.target_names, loc="lower right", title="Classes"
)

ax.set_title("First two PCA dimensions")
ax.set_xlabel("1st Eigenvector")
ax.xaxis.set_ticklabels([])  # blank out the x tick labels
ax.set_ylabel("2nd Eigenvector")
plt.show()

### PCA loadings

In [None]:
# Loadings: every component vector scaled by the square root of its explained
# variance. Rows are the original features, columns the principal components.
# Column names are derived from the number of fitted components instead of
# being hard-coded to four.
loadings = pd.DataFrame(pca.components_.T * np.sqrt(pca.explained_variance_),
                        columns=[f'PC{i+1}' for i in range(pca.components_.shape[0])],
                        index=iris.feature_names)
sns.heatmap(loadings, cmap='coolwarm', annot=True)
plt.show()

In [None]:
# Which features matter most for each principal component?

# A feature counts as important for a component when the absolute value of
# its loading is strictly greater than this cut-off.
threshold = 0.3

# Map every component name to the list of features passing the cut-off
important_features = {
    pc_name: loadings.index[loadings[pc_name].abs() > threshold].tolist()
    for pc_name in loadings.columns
}

# Report one line per component, each followed by an empty line
for pc, features in important_features.items():
    print(f"{pc}: {', '.join(features)}\n")

## Breast Cancer Dataset

In [None]:
# import all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Import and load the breast cancer dataset
from sklearn.datasets import load_breast_cancer
data=load_breast_cancer()

# Show which fields the loaded dataset object provides.
# Printed explicitly: a bare expression in the middle of a cell displays nothing.
print(data.keys())

# Check the output classes
print(data['target_names'])

# Check the input attributes
print(data['feature_names'])

In [None]:
# Put the feature matrix into a DataFrame with named columns
df1 = pd.DataFrame(data['data'], columns=data['feature_names'])

# Standardise the features before PCA: fit the scaler and apply it in one step
scaling = StandardScaler()
Scaled_data = scaling.fit_transform(df1)

# Fit a PCA keeping all components, then project the scaled data onto them
principal = PCA().fit(Scaled_data)
x = principal.transform(Scaled_data)

# Shape of the data after the projection
print(x.shape)

In [None]:
# Scree plot: explained variance ratio of each principal component
explained_var_ratio = principal.explained_variance_ratio_
component_numbers = np.arange(1, explained_var_ratio.size + 1)

plt.plot(component_numbers, explained_var_ratio, marker='o')
# plt.bar(range(1, len(explained_var_ratio) + 1), explained_var_ratio, edgecolor='k', color='none')
plt.title('Scree Plot', fontsize=16)
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.grid(axis='y')
plt.show()

In [None]:
# Plot the cumulative explained variance ratio: the running total of the
# per-component ratios, i.e. the variance retained by the first k components.
explained_var_ratio = principal.explained_variance_ratio_
cumulative_var_ratio = np.cumsum(explained_var_ratio)

plt.plot(range(1, len(cumulative_var_ratio) + 1), cumulative_var_ratio, marker='o')
plt.xlabel('Number of Principal Components')
# The curve is cumulative, so the axis label must say so
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Cumulative Explained Variance Ratio', fontsize=16)
plt.axhline(y=0.95, color='k', linestyle='-.')  # 95% variance line
plt.text(0.2, 0.96, '95% cut-off threshold', color = 'b', fontsize=14)
plt.grid()
plt.show()

In [None]:
# 2d scores plot: samples projected onto the first two principal components,
# coloured by target class
plt.figure(figsize=(5,5))
plt.scatter(x[:,0],x[:,1],c=data['target'],cmap='plasma')
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.xlabel('pc1', fontsize=12)
plt.ylabel('pc2', fontsize=12)
# Render explicitly, like every other plotting cell; without this the cell
# ends on the ylabel call and echoes its return value.
plt.show()

In [None]:
# 3d scores plot: samples on the first three principal components
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(8,8))

# A subplot with the 3d projection gives an axes that accepts x, y and z data
axis = fig.add_subplot(111, projection='3d')

# Columns 0, 1 and 2 of x are the PC1, PC2 and PC3 scores
pc1_scores, pc2_scores, pc3_scores = x[:,0], x[:,1], x[:,2]
axis.scatter(pc1_scores, pc2_scores, pc3_scores, c=data['target'],cmap='winter')

axis.set_xlabel("PC1")
axis.set_ylabel("PC2")
axis.set_zlabel("PC3")
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

plt.show()


In [None]:
# Explained variance ratio of the leading principal components
n_shown = 10
print(principal.explained_variance_ratio_[:n_shown])

In [None]:
# Loadings table: the component vectors produced by the PCA, each scaled by
# the square root of its explained variance.
# Rows are the original features, columns are PC1..PCn.
n_pcs = principal.components_.shape[0]
pc_names = [f'PC{k}' for k in range(1, n_pcs + 1)]
scaled_components = principal.components_.T * np.sqrt(principal.explained_variance_)

loadings = pd.DataFrame(scaled_components, columns=pc_names, index=data['feature_names'])
loadings.head()

In [None]:
# Heatmap of the loadings for all principal components

import seaborn as sns

# Open a wide figure first so the heatmap is drawn into it
plt.figure(figsize=(12, 8))
# sns.heatmap(loadings, annot=True, annot_kws={"size":4}, cmap='BrBG')
sns.heatmap(data=loadings, cmap='coolwarm')
plt.title('PCA Loadings Heatmap')
plt.show()

In [None]:
# Which features matter most for each principal component?

# `loadings` is expected to be the pandas DataFrame of PCA loadings built in
# an earlier cell.
import pandas as pd

# A feature counts as important for a component when the absolute value of
# its loading is strictly greater than this cut-off.
threshold = 0.45

# Map every component name to the list of features passing the cut-off
important_features = {
    pc_name: loadings.index[loadings[pc_name].abs() > threshold].tolist()
    for pc_name in loadings.columns
}

# Report one line per component, each followed by an empty line
for pc, features in important_features.items():
    print(f"{pc}: {', '.join(features)}\n")