<a href="https://colab.research.google.com/github/ubaidillah-chem/fouling-ml/blob/main/02_PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive inside the Colab runtime so later cells can read and
# write files under the mount point. force_remount=True is passed so an
# existing mount is refreshed rather than reused.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

In [None]:
# Load the dataset from the mounted Drive. The path is absolute (matching the
# '/content/gdrive' mount point) so the read does not depend on the notebook's
# current working directory.
df = pd.read_csv("/content/gdrive/MyDrive/dataset.csv")

# Split features and target: 'Rf' is the target column, everything else is
# treated as an input feature.
X = df.drop(columns=['Rf'])
y = df['Rf']

# Standardize the input features (zero mean, unit variance per column) so
# that PCA is not dominated by features with large numeric ranges.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA, keeping as many components as needed to retain 95% of the
# variance. The resulting number of components is therefore data-dependent.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Create a DataFrame with the PCA scores, one column per component (PC1..PCk)
pca_columns = [f"PC{i+1}" for i in range(X_pca.shape[1])]
df_pca = pd.DataFrame(X_pca, columns=pca_columns)
# Reattach the target column; .values is used so rows are aligned by
# position rather than by df's index labels.
df_pca['Rf'] = y.values

# Save to CSV (written to the current working directory, without the index)
df_pca.to_csv("pca_transformed_data.csv", index=False)


In [None]:
# Preview the PCA-transformed table: display its first five rows
df_pca.head(n=5)


In [None]:
# Dimensions of the original (pre-PCA) feature table as (rows, columns)
X.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Collect the principal-component columns (every df_pca column named "PC...")
pc_columns = [col for col in df_pca.columns if col.startswith("PC")]

# All unordered pairs (PCa, PCb) with PCa listed before PCb. The plotting
# code below does not read this list; the name is kept so it remains
# available to any other cell that may rely on it.
pair_combinations = [
    (first, second)
    for idx, first in enumerate(pc_columns)
    for second in pc_columns[idx + 1:]
]

# n is the number of retained components; the scatter grid is (n-1) x (n-1)
n = len(pc_columns)

if n < 2:
    # A single component cannot be paired, and a 0x0 subplot grid is invalid
    print("Pairwise PCA plot skipped: fewer than two principal components.")
else:
    # squeeze=False guarantees a 2-D array of Axes even for a 1x1 grid
    # (n == 2), so axes[i, j] indexing below is always valid.
    fig, axes = plt.subplots(nrows=n - 1, ncols=n - 1, figsize=(16, 16),
                             squeeze=False)
    plt.subplots_adjust(hspace=0.4, wspace=0.4)

    for i in range(n - 1):
        for j in range(n - 1):
            ax = axes[i, j]
            if j > i:
                # Upper triangle would only mirror the lower one: hide it
                ax.set_visible(False)
                continue
            # Lower triangle: row i shows PC(i+2) on y, column j shows PC(j+1) on x
            sns.scatterplot(data=df_pca, x=pc_columns[j], y=pc_columns[i + 1],
                            hue='Rf', palette='coolwarm', s=10, ax=ax,
                            legend=True)
            ax.set_xlabel(pc_columns[j])
            ax.set_ylabel(pc_columns[i + 1])

    plt.suptitle("Pairwise PCA Scatter Plots Colored by Rf", fontsize=16)
    plt.tight_layout(rect=[0, 0, 1, 0.97])
    # No plt.legend() here: seaborn already draws a legend on each subplot,
    # and plt.legend() would only rebuild the legend of the current axes.
    plt.show()

In [None]:
# Compute PCA loadings (contributions of original features to PCs).
# The component count is read from the fitted PCA itself instead of from a
# variable defined in another cell, so this cell only needs `pca`, `X` and
# `df` to exist.
n_comps = pca.components_.shape[0]
loadings = pd.DataFrame(
    pca.components_.T,
    index=X.columns,
    columns=[f'PC{i+1}' for i in range(n_comps)],
)

# Number of highest-|loading| features considered per component
TOP_K = 10

# Union, over all PCs, of the TOP_K features with the largest absolute loading
top_features_set = set()
for pc in loadings.columns:
    top_features = loadings[pc].abs().sort_values(ascending=False).head(TOP_K).index.tolist()
    top_features_set.update(top_features)

# Features to drop: every column that is neither a top contributor nor the
# target 'Rf'. Built as a list in df's column order so the result is
# deterministic.
features_to_drop = [
    col for col in df.columns
    if col not in top_features_set and col != 'Rf'
]

# Create filtered DataFrame and store it on the mounted Drive
filtered_df = df.drop(columns=features_to_drop)
filtered_df.to_csv('/content/gdrive/MyDrive/dataset_filtered_by_top_pca_loadings.csv', index=False)

print(f"Filtered dataset shape: {filtered_df.shape}")

# Plot the top contributors for every PC, with loading values labeled.
# Two panels per row; integer arithmetic rounds the row count up.
n_rows = (n_comps + 1) // 2
plt.figure(figsize=(14, 16))

for i in range(n_comps):  # PC1 to PCn
    pc_name = f'PC{i+1}'
    # Rank by absolute loading, then look up the signed values for plotting
    top_features = loadings[pc_name].abs().sort_values(ascending=False).head(TOP_K)
    top_features_names = top_features.index
    top_values = loadings.loc[top_features_names, pc_name]

    ax = plt.subplot(n_rows, 2, i + 1)
    # Reverse the order so the strongest contributor is drawn at the top
    bars = ax.barh(top_features_names[::-1], top_values.iloc[::-1], color='skyblue')
    ax.set_title(f"Top {TOP_K} Feature Contributions to {pc_name}")
    ax.set_xlabel("Loading Value")
    ax.grid(True)

    # Add text labels on the bars
    for bar in bars:
        value = bar.get_width()  # signed loading (negative bars extend left)
        ax.text(
            value + 0.01 * (1 if value > 0 else -1),  # offset slightly away from the bar end
            bar.get_y() + bar.get_height() / 2,
            f"{value:.4f}",
            va='center',
            ha='left' if value > 0 else 'right',
            fontsize=9,
            color='black'
        )

plt.tight_layout()
plt.show()
