In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Step 1: Load data
# X holds the feature matrix, y the integer class code of every sample, and
# labels the species name that each class code stands for.
iris = load_iris()
X = iris.data
y = iris.target
labels = iris.target_names

# Step 2: Standardize features
# PCA is driven by variance, so put every feature on the same scale first.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Apply PCA (keep all components to see full variance)
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Step 4: Scree plot
component_numbers = range(1, len(pca.explained_variance_ratio_) + 1)
# Start on a fresh figure so the curve is not drawn onto a figure left open
# by earlier code.
plt.figure()
plt.plot(component_numbers, pca.explained_variance_ratio_, marker='o')
plt.title("Scree Plot")
plt.xlabel("Principal Component")
plt.ylabel("Explained Variance Ratio")
# Tick only whole component numbers; the default locator can place fractional
# ticks (e.g. 1.5) that do not correspond to any component.
plt.xticks(component_numbers)
plt.grid(True)
plt.show()

# Step 5: Reduce to 2D and visualize
pca_2d = PCA(n_components=2)
X_2d = pca_2d.fit_transform(X_scaled)

# Convert to DataFrame
df = pd.DataFrame(X_2d, columns=['PC1', 'PC2'])
# Index the name array with the class codes to label every row in one
# vectorized step instead of a Python-level loop.
df['Species'] = labels[y]

# Step 6: Scatter plot
# One scatter call per species so each gets its own color and legend entry.
plt.figure(figsize=(8, 6))
for species in df['Species'].unique():
    subset = df[df['Species'] == species]
    plt.scatter(subset['PC1'], subset['PC2'], label=species)

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('2D PCA of Iris Dataset')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

# Step 1: Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target      # feature matrix and integer class codes (0, 1, 2)
labels = iris.target_names         # species name behind each class code
features = iris.feature_names      # feature names (not used further in this cell)

# Step 2: Standardize the data (mean = 0, std = 1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Fit PCA with every component kept to inspect how variance is split
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Step 4: Report the share of variance carried by each component
explained_variance = pca.explained_variance_ratio_
print("Explained variance by each Principal Component:")
print("\n".join(f"PC{i+1}: {var:.2%}" for i, var in enumerate(explained_variance)))

# Step 5: Scree plot showing per-component and running-total variance
cumulative_variance = np.cumsum(explained_variance)
component_ticks = range(1, len(explained_variance) + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_ticks, explained_variance, marker='o', label='Individual Variance')
ax.plot(component_ticks, cumulative_variance, marker='s', linestyle='--', label='Cumulative Variance')
ax.set_title("Scree Plot")
ax.set_xlabel("Principal Component")
ax.set_ylabel("Explained Variance Ratio")
ax.set_xticks(component_ticks)
ax.grid(True)
ax.legend()
plt.show()

# Step 6: Project onto the first 2 principal components for a 2D view
pca_2d = PCA(n_components=2)
X_2d = pca_2d.fit_transform(X_scaled)

# Tabulate the 2D projection together with each sample's species name
df_2d = pd.DataFrame(X_2d, columns=['PC1', 'PC2'])
df_2d['Species'] = [labels[code] for code in y]

# Step 7: 2D scatter, one series per species (groups kept in order of appearance)
fig, ax = plt.subplots(figsize=(8, 6))
for species, subset in df_2d.groupby('Species', sort=False):
    ax.scatter(subset['PC1'], subset['PC2'], label=species)

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('2D PCA of Iris Dataset')
ax.legend()
ax.grid(True)
plt.show()

# Step 8: Project onto the first 3 principal components for a 3D view
pca_3d = PCA(n_components=3)
X_3d = pca_3d.fit_transform(X_scaled)

# Step 9: 3D scatter, one series per class code
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

for idx, label in enumerate(labels):
    points = X_3d[y == idx]        # rows belonging to the current class
    ax.scatter(points[:, 0], points[:, 1], points[:, 2], label=label, s=60)

ax.set(title="3D PCA of Iris Dataset", xlabel="PC1", ylabel="PC2", zlabel="PC3")
ax.legend()
plt.show()


In [None]:
Detailed Explanation of the Code:
```python
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
```
Imports:
numpy: For numerical operations like array manipulations.

pandas: For data handling and manipulation.

matplotlib.pyplot: For plotting graphs and visualizations.

load_iris: From sklearn.datasets, used to load the famous Iris dataset.

StandardScaler: From sklearn.preprocessing, used to standardize the dataset (mean = 0, standard deviation = 1).

PCA: From sklearn.decomposition, used to perform Principal Component Analysis.

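Axes3D: Imported for 3D plotting. The name is never referenced directly; importing it registers the '3d' projection used later by add_subplot(projection='3d'). This import is required on Matplotlib versions before 3.2 and optional on newer ones.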

```python
# Step 1: Load the Iris dataset
iris = load_iris()
X = iris.data                # Feature matrix (input data)
y = iris.target              # Target labels (output data)
labels = iris.target_names   # Target names (species: setosa, versicolor, virginica)
features = iris.feature_names # Feature names (sepal length, sepal width, petal length, petal width)
```
Step 1: Loading Data:
The Iris dataset is loaded using load_iris(), which provides the data in a structured form. This dataset consists of 4 features (sepal length, sepal width, petal length, petal width) and 150 samples of Iris flowers, divided into three species: Setosa, Versicolor, and Virginica.

X contains the feature values (numerical data).

y contains the labels (target values), which represent the species of each flower.

labels stores the species names for clarity, and features stores the names of the attributes (sepal/petal dimensions).

```python
# Step 2: Standardize the data (mean = 0, std = 1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
```
Step 2: Standardizing the Data:
PCA is sensitive to the scale of data. If the data is not standardized, features with larger ranges can dominate the principal components.

To avoid this, we use StandardScaler to standardize the data by subtracting the mean and dividing by the standard deviation. This gives us a dataset where each feature has a mean of 0 and a standard deviation of 1.

X_scaled is the standardized version of the dataset.

```python
# Step 3: Apply PCA with all components to understand variance distribution
pca = PCA()
X_pca = pca.fit_transform(X_scaled)
```
Step 3: Apply PCA:
We create an instance of the PCA class and fit it to the standardized data (X_scaled) using the fit_transform method. This performs PCA on the entire dataset.

X_pca contains the transformed data expressed in terms of the principal components (PCs). By default (n_components=None), PCA keeps min(n_samples, n_features) components, which is 4 here: one per feature.

```python
# Step 4: Print explained variance ratio for each component
print("Explained variance by each Principal Component:")
for i, var in enumerate(pca.explained_variance_ratio_):
    print(f"PC{i+1}: {var:.2%}")
```
Step 4: Explained Variance:
PCA identifies the directions (principal components) in which the data has the most variance. Each principal component explains a certain percentage of the total variance in the dataset.

We print the explained variance ratio for each principal component. This tells us how much of the data's variance is captured by each component.

pca.explained_variance_ratio_ provides the fraction (between 0 and 1) of the total variance explained by each PC; the `.2%` format specifier prints it as a percentage. We print this out to see how much each PC contributes to the overall data variation.

```python
# Step 5: Scree plot (individual + cumulative variance)
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

plt.figure(figsize=(8, 5))
plt.plot(range(1, len(explained_variance)+1), explained_variance, marker='o', label='Individual Variance')
plt.plot(range(1, len(cumulative_variance)+1), cumulative_variance, marker='s', linestyle='--', label='Cumulative Variance')
plt.title("Scree Plot")
plt.xlabel("Principal Component")
plt.ylabel("Explained Variance Ratio")
plt.xticks(range(1, len(explained_variance)+1))
plt.grid(True)
plt.legend()
plt.show()
```
Step 5: Scree Plot:
The Scree plot is used to visualize the explained variance of each principal component.

We plot both:

Individual variance (variance explained by each component).

Cumulative variance (total variance explained up to that component).

The scree plot helps us understand how many components are required to capture a significant portion of the variance. You can identify the "elbow" point in the plot, which suggests the optimal number of components.

```python
# Step 6: Reduce to 2 principal components for 2D visualization
pca_2d = PCA(n_components=2)
X_2d = pca_2d.fit_transform(X_scaled)
```
Step 6: 2D PCA:
To reduce the dimensionality of the dataset for easier visualization, we perform PCA with 2 components (n_components=2).

X_2d is the transformed dataset with only the first two principal components. This allows us to plot the data in 2D.

```python
# Create DataFrame for 2D plot
df_2d = pd.DataFrame(X_2d, columns=['PC1', 'PC2'])
df_2d['Species'] = [labels[i] for i in y]
```
Create DataFrame for 2D Visualization:
We convert the reduced 2D data (X_2d) into a Pandas DataFrame for easy plotting.

The Species column is added to label each data point according to its species (Setosa, Versicolor, or Virginica).

```python
# Step 7: 2D Scatter Plot
plt.figure(figsize=(8, 6))
for species in df_2d['Species'].unique():
    subset = df_2d[df_2d['Species'] == species]
    plt.scatter(subset['PC1'], subset['PC2'], label=species)

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('2D PCA of Iris Dataset')
plt.legend()
plt.grid(True)
plt.show()
```
Step 7: 2D Scatter Plot:
A scatter plot is created to visualize the data in 2D using the first two principal components (PC1 and PC2).

Each point is colored according to its species, making it easy to visually separate the Iris species based on their principal component values.

```python
# Step 8: Reduce to 3 components for 3D visualization
pca_3d = PCA(n_components=3)
X_3d = pca_3d.fit_transform(X_scaled)
```
Step 8: 3D PCA:
To explore the dataset in 3D, we reduce the data to 3 components (n_components=3) and store the result in X_3d.

This step helps visualize the data in 3D space.

```python
# Step 9: 3D Scatter Plot
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

for idx, label in enumerate(labels):
    ax.scatter(
        X_3d[y == idx, 0],
        X_3d[y == idx, 1],
        X_3d[y == idx, 2],
        label=label,
        s=60
    )

ax.set_title("3D PCA of Iris Dataset")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
ax.legend()
plt.show()
```
Step 9: 3D Scatter Plot:
A 3D scatter plot is created to visualize the data points in three dimensions using the first three principal components.

The points are color-coded based on the species, making it easier to separate the classes visually in 3D space.

The projection='3d' argument allows us to create a 3D plot.

In [None]:
Concept of PCA:
PCA (Principal Component Analysis) is a technique used for dimensionality reduction in datasets with multiple variables.

It transforms the original data into a new set of orthogonal axes called Principal Components (PCs).

The goal is to reduce the number of variables while retaining as much variance as possible from the data.

PCA is widely used for:

Data visualization (reducing dimensions to 2D or 3D).

Noise reduction.

Feature extraction.

Steps in PCA:
Standardize the Data:

Standardize the data to ensure each feature has a mean of 0 and a standard deviation of 1.

Formula:

$$Z = \frac{X - \mu}{\sigma}$$

where:

$X$ is the original data.

$\mu$ is the mean of each feature.

$\sigma$ is the standard deviation of each feature.

$Z$ is the standardized data.

Compute Covariance Matrix:

The covariance matrix captures the relationship between different features in the data.

Formula for covariance between two variables $x$ and $y$:

$$\operatorname{Cov}(x, y) = \frac{1}{n-1} \sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})$$

$n$ is the number of samples.

$x_i, y_i$ are the individual data points.

$\bar{x}, \bar{y}$ are the mean values of $x$ and $y$.

Compute Eigenvalues and Eigenvectors:

Eigenvalues and eigenvectors are used to define the principal components.

Eigenvectors give the direction of the new axes (principal components).

Eigenvalues indicate the variance along the respective eigenvector.

Sort Eigenvalues:

The eigenvectors are sorted based on their eigenvalues in descending order.

The largest eigenvalue corresponds to the principal component with the most variance.

Select the Top k Principal Components:

Choose the top k eigenvectors corresponding to the largest eigenvalues to form the new set of features (the reduced dataset).

Transform the Data:

The final step is to transform the original data into the new space defined by the selected eigenvectors (principal components).

Formula:

$$X_{\text{new}} = Z \times V$$

where:

$Z$ is the standardized data matrix.

$V$ is the matrix of eigenvectors (principal components).

$X_{\text{new}}$ is the transformed data in the reduced dimensional space.

Key Terms:
Variance: The amount of spread or dispersion in the dataset. PCA seeks to maximize variance along the new principal components.

Covariance Matrix: A square matrix that shows the covariance between different features in the dataset.

Eigenvalues: Measure the amount of variance captured by the corresponding eigenvector (principal component).

Eigenvectors: Define the direction of the new axes (principal components) in the transformed space.

Formulae:
Covariance (between two features $x$ and $y$):

$$\operatorname{Cov}(x, y) = \frac{1}{n-1} \sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})$$

Standardization:

$$Z = \frac{X - \mu}{\sigma}$$

where:

$\mu$ is the mean.

$\sigma$ is the standard deviation.

PCA Transformation:

$$X_{\text{new}} = Z \times V$$

where:

$Z$ is the standardized data.

$V$ is the matrix of eigenvectors (principal components).

$X_{\text{new}}$ is the transformed data.

Eigenvalue Decomposition:

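$$C \, V = V \, \Lambda$$

where:

$C$ is the covariance matrix.

$V$ is the eigenvector matrix (one eigenvector per column).

$\Lambda$ is the diagonal eigenvalue matrix. For a single eigenvector $v$ with eigenvalue $\lambda$, this reads $C\,v = \lambda\,v$.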

Explained Variance (per principal component):

$$\text{Explained Variance Ratio}_i = \frac{\lambda_i}{\sum_{j=1}^{n} \lambda_j}$$

where:

$\lambda_i$ is the eigenvalue for the $i$-th principal component, and the sum in the denominator runs over all $n$ principal components.

Important Points to Remember:
Standardization is essential to make sure that the features with larger ranges don't dominate the PCA transformation.

The principal components (PCs) are orthogonal (uncorrelated) directions that maximize the variance in the data.

The first principal component (PC1) captures the highest variance, PC2 captures the second-highest variance, and so on.

Dimensionality reduction: By selecting only the top k PCs, you reduce the data's dimensionality while retaining most of the variance.

The number of dimensions to reduce to (k) is often chosen by looking at the cumulative explained variance or the scree plot.

Common Applications of PCA:
Data visualization: Reducing the dataset to 2 or 3 dimensions for easier visualization.

Noise reduction: By keeping only the most important principal components, you can reduce noise in the data.

Feature extraction: Identifying the most important features (principal components) for downstream tasks like classification or clustering.