# PCA Analysis of Physical Data

In [None]:
# Libraries
import pandas as pd
import numpy as np
from pickleshare import PickleShareDB
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os

## Load Data

In [None]:
# Open the PickleShare database that lives under the prep_data directory
data_dir = '../prep_data'
db = PickleShareDB(os.path.join(data_dir, 'kity'))

# Pull the five physical datasets out of the database, in a fixed order,
# and bind each one to its own module-level name
df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm = (
    db[key]
    for key in ('df_phy_1', 'df_phy_2', 'df_phy_3', 'df_phy_4', 'df_phy_norm')
)

## Data Preparation

In [None]:
# Tag every dataset with the name of the source it came from, then stack
# them into a single frame. `assign` returns a modified copy, so the
# originally loaded frames are left untouched.
source_frames = {
    'phy_1': df_phy_1,
    'phy_2': df_phy_2,
    'phy_3': df_phy_3,
    'phy_4': df_phy_4,
    'phy_norm': df_phy_norm,
}
dfs = [frame.assign(source=tag) for tag, frame in source_frames.items()]

# ignore_index=True gives the combined frame a fresh 0..n-1 row index
df_combined = pd.concat(dfs, ignore_index=True)
print("Combined shape:", df_combined.shape)

In [None]:
# Select features for PCA
# Start from the numeric columns and drop everything that must not drive the
# decomposition: the Time column, the label columns, and the columns whose
# names start with 'Valv_' or 'Pump_' (the "boolean columns" of the original
# note -- presumably on/off state flags; confirm against the dataset).
# Time and Label are listed explicitly so they stay excluded even if they
# happen to be stored with a numeric dtype.
excluded_columns = {'Time', 'Label', 'Label_n'}
excluded_prefixes = ('Valv_', 'Pump_')

numeric_features = df_combined.select_dtypes(include=[np.number]).columns
features_for_pca = [
    col for col in numeric_features
    if col not in excluded_columns and not col.startswith(excluded_prefixes)
]

print("Features selected for PCA:")
print(features_for_pca)

In [None]:
# Pull out just the columns chosen for PCA
X = df_combined[features_for_pca]

# Fit a StandardScaler on those columns and transform them in one step;
# the fitted scaler is kept in `scaler` for later use
scaler = StandardScaler()

# Re-wrap the scaled array in a DataFrame so the feature names stay attached
X_scaled = pd.DataFrame(data=scaler.fit_transform(X), columns=features_for_pca)

## Determining Optimal Number of Components

In [None]:
# Fit a PCA that keeps one component per input feature, so the variance
# ratio of every component can be inspected
pca_var = PCA(n_components=len(features_for_pca))
pca_var.fit(X_scaled)

# Per-component share of the variance, and its running total
explained_variance = pca_var.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Tabulate both series under PC1, PC2, ... labels for plotting
component_labels = [f'PC{k}' for k in range(1, len(explained_variance) + 1)]
variance_df = pd.DataFrame({
    'Component': component_labels,
    'Explained_Variance': explained_variance,
    'Cumulative_Variance': cumulative_variance,
})

# Bars: individual explained-variance ratios, annotated to three decimals
individual_bars = go.Bar(
    name='Individual',
    x=variance_df['Component'],
    y=variance_df['Explained_Variance'],
    text=variance_df['Explained_Variance'].round(3),
    textposition='auto',
)

# Red line with markers: cumulative explained variance
cumulative_line = go.Scatter(
    name='Cumulative',
    x=variance_df['Component'],
    y=variance_df['Cumulative_Variance'],
    mode='lines+markers',
    line=dict(color='red'),
)

# Assemble the figure from the two traces (bars first, then the line)
fig = go.Figure(data=[individual_bars, cumulative_line])
fig.update_layout(
    title='Explained Variance Ratio by Principal Component',
    xaxis_title='Principal Component',
    yaxis_title='Explained Variance Ratio',
    showlegend=True,
)

fig.show()

In [None]:
# List the running total of explained variance, one line per component,
# numbering the components from 1
print("Cumulative Explained Variance:")
for pc_number, running_total in enumerate(cumulative_variance, start=1):
    print(f"PC{pc_number}: {running_total:.4f}")

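Using the explained-variance plot and the cumulative table above, we can choose the number of components to keep. A common rule of thumb is to pick the smallest number of components whose cumulative explained variance reaches roughly 80–90%, which retains most of the variance while keeping the number of components manageable.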

In [None]:
# Number of principal components to keep
n_components = 3  # Adjust based on explained variance analysis

# Fit the final PCA on the scaled features and project the data onto it
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

# Wrap the projected coordinates in a DataFrame with PC1..PCn column names
pc_names = [f'PC{k}' for k in range(1, n_components + 1)]
pca_df = pd.DataFrame(X_pca, columns=pc_names)

# Carry the label and source columns over from the combined frame
# (assignment aligns on the row index, which both frames share)
for meta_col in ('Label', 'Label_n', 'source'):
    pca_df[meta_col] = df_combined[meta_col]

## Visualizing PCA Results

In [None]:
# Plot the samples on the first two principal components, one colour per
# Label value; the originating dataset is shown on hover
fig = px.scatter(
    data_frame=pca_df,
    x='PC1',
    y='PC2',
    color='Label',
    hover_data=['source'],
    title='PCA Results - First Two Components',
)
fig.show()

In [None]:
# 3D Scatter plot
# The plot needs the PC1, PC2 and PC3 columns of pca_df. The number of
# components kept is meant to be adjustable, so when fewer than three are
# available the plot is skipped with a message instead of failing on a
# missing column.
required_axes = ['PC1', 'PC2', 'PC3']
missing_axes = [axis for axis in required_axes if axis not in pca_df.columns]

if missing_axes:
    print(f"Skipping 3D scatter plot; missing components: {missing_axes}")
else:
    # Colour by Label; show the originating dataset on hover
    fig = px.scatter_3d(
        pca_df,
        x='PC1',
        y='PC2',
        z='PC3',
        color='Label',
        title='PCA Results - First Three Components',
        hover_data=['source']
    )
    fig.show()

## Feature Importance Analysis

In [None]:
# Arrange the PCA component coefficients as a table:
# one row per principal component, one column per input feature
pc_row_labels = [f'PC{k}' for k in range(1, n_components + 1)]
components_df = pd.DataFrame(
    data=pca.components_,
    index=pc_row_labels,
    columns=features_for_pca,
)

# Render the table as a heatmap on the RdBu colour scale
fig = px.imshow(
    components_df,
    color_continuous_scale='RdBu',
    aspect='auto',
    title='PCA Components Matrix Heatmap',
)
fig.show()

In [None]:
# For each principal component, list the five features with the largest
# absolute coefficients
for pc_number in range(1, n_components + 1):
    print(f"\nTop features contributing to PC{pc_number}:")
    # Magnitude of every feature's coefficient on this component
    magnitudes = np.abs(pca.components_[pc_number - 1])
    # Positions of the features from largest to smallest magnitude; the
    # stable sort keeps tied features in their original order
    ranked_positions = np.argsort(-magnitudes, kind='stable')
    for position in ranked_positions[:5]:
        print(f"{features_for_pca[position]}: {magnitudes[position]:.3f}")

## Save Results to PickleShareDB

In [None]:
# Bundle everything produced by the analysis for use in Streamlit: the
# projected data, the variance table, the component matrix, the feature
# list, and the fitted PCA model and scaler objects
pca_results = dict(
    transformed_data=pca_df,
    explained_variance=variance_df,
    components_matrix=components_df,
    feature_names=features_for_pca,
    pca_model=pca,
    scaler=scaler,
)

# Store the bundle in the PickleShare database under a single key
db['pca_results_phy'] = pca_results