<a href="https://colab.research.google.com/github/vmjs1234/ADVANCED_DIMENSIONALITY_REDUCTION/blob/main/TABULAR/Dimensionality_Reduction_Tabular.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install UMAP
!pip install umap-learn --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/88.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:

# Import Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px


In [3]:

from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import FactorAnalysis, IncrementalPCA, KernelPCA, PCA
from sklearn.manifold import Isomap, LocallyLinearEmbedding, MDS, TSNE
from umap import UMAP
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


In [4]:

# Load the breast-cancer dataset and unpack the pieces used by later cells:
# the feature matrix, the class labels, and the human-readable names.
data = load_breast_cancer()
X, y = data["data"], data["target"]
feature_names, target_names = data["feature_names"], data["target_names"]


In [5]:


# Combine the feature matrix and the labels into one DataFrame for inspection.
# Relies on X, y and feature_names from the dataset-loading cell and on the
# pandas import in the notebook's import cell.
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

# A bare last expression renders the rich table instead of wrapped plain text.
df.head()

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [6]:

# Labelled DataFrame: one column per feature plus a 'target' column
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

# Standardize the raw feature matrix; every reduction below works on X_scaled
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# 1. Principal Component Analysis (PCA)

**Description:** PCA reduces dimensionality by projecting the data onto the orthogonal axes (principal components) along which its variance is largest.

**Results:**
- The clusters are spread out along the directions of greatest variance.
- PCA is computationally efficient and well suited to data whose structure is approximately linear.

**Limitations:**
- Struggles with non-linear structure in the data.


In [7]:
# Apply PCA: fit a 2-component PCA on the standardized features and keep the
# resulting 2-D projection of every sample in X_pca.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)


In [17]:

# Visualize the 2-D PCA embedding, colouring each sample by its class label.
# The embedding is plotted directly: looping with a variable named `data`
# would overwrite the dataset object bound to `data` when it was loaded.
fig = px.scatter(x=X_pca[:, 0], y=X_pca[:, 1], color=y.astype(str),
                 title="PCA Visualization",
                 labels={'color': 'Target'})
fig.show()


# 2. Randomized PCA

**Description:** A variant of PCA that uses a randomized SVD to approximate the leading principal components, which makes it faster on large datasets.

**Results:**
- Visually similar to the standard PCA projection.
- Faster to compute on large datasets.

**Limitations:**
- The components are an approximation, so slightly less variance may be captured than with exact PCA.


In [8]:

# Apply Randomized PCA.
# The randomized SVD solver draws random projections, so random_state is fixed
# to make the 2-D embedding repeatable from run to run.
rpca = PCA(n_components=2, svd_solver='randomized', random_state=42)
X_rpca = rpca.fit_transform(X_scaled)


In [14]:

# Visualize the 2-D Randomized PCA embedding, colouring each sample by its
# class label. The embedding is plotted directly: looping with a variable named
# `data` would overwrite the dataset object bound to `data` when it was loaded.
fig = px.scatter(x=X_rpca[:, 0], y=X_rpca[:, 1], color=y.astype(str),
                 title="Randomized PCA Visualization",
                 labels={'color': 'Target'})
fig.show()


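# 3. Kernel PCA

**Description:** Extends PCA with a kernel function (here an RBF kernel), so that the principal components are found in an implicit feature space and can follow non-linear structure.

**Results:**
- Captures non-linear patterns that linear PCA cannot.
- Can show better cluster separation than standard PCA.

**Limitations:**
- More computationally expensive, because it builds and decomposes an n_samples × n_samples kernel matrix.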

In [11]:

# Apply Kernel PCA with an RBF kernel.
# For a dataset of this size (more than 200 samples, fewer than 10 components)
# the default eigen_solver='auto' resolves to ARPACK, which starts from a
# random vector; random_state is fixed so the embedding is repeatable.
kpca = KernelPCA(n_components=2, kernel='rbf', random_state=42)
X_kpca = kpca.fit_transform(X_scaled)


In [15]:

# Visualize the 2-D Kernel PCA embedding, colouring each sample by its class
# label. The embedding is plotted directly: looping with a variable named
# `data` would overwrite the dataset object bound to `data` when it was loaded.
fig = px.scatter(x=X_kpca[:, 0], y=X_kpca[:, 1], color=y.astype(str),
                 title="Kernel PCA Visualization",
                 labels={'color': 'Target'})
fig.show()


# 4. Incremental PCA

**Description:** Fits PCA on the data in small batches, which makes it suitable for datasets that do not fit in memory.

**Results:**
- Captures slightly less variance than standard PCA.
- Useful for streaming data or data that arrives incrementally.

**Limitations:**
- Offers no advantage over standard PCA when the whole dataset fits in memory.

In [12]:

# Incremental PCA: learn 2 components from mini-batches of 10 samples,
# then project the standardized features onto them
ipca = IncrementalPCA(batch_size=10, n_components=2)
X_ipca = ipca.fit_transform(X_scaled)


In [16]:

# Visualize the 2-D Incremental PCA embedding, colouring each sample by its
# class label. The embedding is plotted directly: looping with a variable named
# `data` would overwrite the dataset object bound to `data` when it was loaded.
fig = px.scatter(x=X_ipca[:, 0], y=X_ipca[:, 1], color=y.astype(str),
                 title="Incremental PCA Visualization",
                 labels={'color': 'Target'})
fig.show()


# 5. t-SNE

**Description:** A non-linear technique that places points in the low-dimensional space so that local neighbourhoods of the original data are preserved.

**Results:**
- Shows distinct clusters and preserves local structure.
- Effective for exploratory analysis.

**Limitations:**
- High computational cost.
- Results vary with the hyperparameters (e.g., perplexity) and the random seed.

In [18]:
# t-SNE: embed the standardized features in 2-D with a fixed seed
tsne = TSNE(random_state=42, n_components=2)
X_tsne = tsne.fit_transform(X_scaled)

# Scatter plot of the embedding, one point per sample, coloured by class label
fig = px.scatter(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    color=y.astype(str),
    title="t-SNE Visualization",
    labels={'color': 'Target'},
)
fig.show()


# 6. UMAP

**Description:** A non-linear, neighbour-graph-based method similar in spirit to t-SNE but computationally more efficient.

**Results:**
- Tends to preserve more of the global structure than t-SNE.
- Fast to compute and highly effective for visualization.

**Limitations:**
- Results can be sensitive to parameters such as `n_neighbors`.

In [19]:
# UMAP: embed the standardized features in 2-D with a fixed seed
umap = UMAP(random_state=42, n_components=2)
X_umap = umap.fit_transform(X_scaled)

# Scatter plot of the embedding, one point per sample, coloured by class label
fig = px.scatter(
    x=X_umap[:, 0],
    y=X_umap[:, 1],
    color=y.astype(str),
    title="UMAP Visualization",
    labels={'color': 'Target'},
)
fig.show()



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



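# 7. Isomap

**Description:** Preserves geodesic distances, i.e. distances measured along the data manifold through a nearest-neighbour graph, and therefore captures global structure.

**Results:**
- Can separate clusters that lie on a curved manifold.
- Suitable for manifold-learning tasks.

**Limitations:**
- Struggles with noisy data, because noise can distort the neighbour graph on which the geodesic distances are based.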

In [20]:
# Isomap: embed the standardized features in 2-D
isomap = Isomap(n_components=2)
X_isomap = isomap.fit_transform(X_scaled)

# Scatter plot of the embedding, one point per sample, coloured by class label
fig = px.scatter(
    x=X_isomap[:, 0],
    y=X_isomap[:, 1],
    color=y.astype(str),
    title="Isomap Visualization",
    labels={'color': 'Target'},
)
fig.show()


# 8. Locally Linear Embedding (LLE)

**Description:** Reconstructs each point from its nearest neighbours and finds a low-dimensional embedding that preserves those local relationships.

**Results:**
- Effective for non-linear patterns.
- Clusters are locally coherent, but the global structure may be lost.

**Limitations:**
- Computationally expensive for large datasets.

In [21]:
# Apply LLE.
# For a dataset of this size the default eigen_solver='auto' resolves to
# ARPACK, which starts from a random vector; random_state is fixed so the
# embedding is repeatable from run to run.
lle = LocallyLinearEmbedding(n_components=2, random_state=42)
X_lle = lle.fit_transform(X_scaled)

# Visualize the 2-D embedding, colouring each sample by its class label
fig = px.scatter(x=X_lle[:, 0], y=X_lle[:, 1], color=y.astype(str),
                 title="LLE Visualization",
                 labels={'color': 'Target'})
fig.show()


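# 9. Multidimensional Scaling (MDS)

**Description:** Finds a low-dimensional embedding whose pairwise distances match those of the original data as closely as possible.

**Results:**
- Maintains the overall geometry of the data well.
- Useful for visualizing similarity (or dissimilarity) data.

**Limitations:**
- High computational cost.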

In [22]:
# MDS: embed the standardized features in 2-D with a fixed seed
mds = MDS(random_state=42, n_components=2)
X_mds = mds.fit_transform(X_scaled)

# Scatter plot of the embedding, one point per sample, coloured by class label
fig = px.scatter(
    x=X_mds[:, 0],
    y=X_mds[:, 1],
    color=y.astype(str),
    title="MDS Visualization",
    labels={'color': 'Target'},
)
fig.show()


# 10. Factor Analysis

**Description:** Models the observed features as combinations of a few latent factors that explain the variance the features share.

**Results:**
- Captures shared variance effectively.
- Less effective for non-linear data.

**Limitations:**
- Not designed primarily for visualization tasks.


In [23]:
# Factor Analysis: estimate 2 latent factors from the standardized features
fa = FactorAnalysis(n_components=2)
X_fa = fa.fit_transform(X_scaled)

# Scatter plot of the factor scores, one point per sample, coloured by class label
fig = px.scatter(
    x=X_fa[:, 0],
    y=X_fa[:, 1],
    color=y.astype(str),
    title="Factor Analysis Visualization",
    labels={'color': 'Target'},
)
fig.show()


# 11. Autoencoder

**Description:** A neural network trained to reconstruct its input through a narrow bottleneck layer; the bottleneck activations are the compressed representation of the data.

**Results:**
- Provides flexible and powerful representations.
- Can capture both linear and non-linear patterns.

**Limitations:**
- Requires significant training time and hyperparameter tuning.

In [25]:
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling -- and therefore the plotted embedding --
# are repeatable from run to run.
set_random_seed(42)

# Define Autoencoder: input -> 64 -> 2 (bottleneck) -> 64 -> input
input_dim = X_scaled.shape[1]
inputs = Input(shape=(input_dim,))  # Define Input layer
x = Dense(64, activation='relu')(inputs)  # Connect Input to Dense layer
# NOTE(review): ReLU on the bottleneck restricts the latent codes to
# non-negative values; consider a linear bottleneck if a latent dimension
# collapses to zero.
latent = Dense(2, activation='relu')(x)
x = Dense(64, activation='relu')(latent)
outputs = Dense(input_dim, activation='linear')(x)

autoencoder = Model(inputs=inputs, outputs=outputs)  # Create Model

# Train the network to reconstruct its own (standardized) input
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=16, verbose=0)

# Extract Latent Space: build the encoder from the bottleneck tensor itself
# rather than a positional layer index, which breaks if layers are added.
encoder = Model(inputs=inputs, outputs=latent)  # Extract encoder part
X_autoencoder = encoder.predict(X_scaled, verbose=0)

# Visualize the 2-D latent codes, colouring each sample by its class label
fig = px.scatter(x=X_autoencoder[:, 0], y=X_autoencoder[:, 1], color=y.astype(str),
                 title="Autoencoder Visualization",
                 labels={'color': 'Target'})
fig.show()

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
