# Analyse ACP des données physiques

In [1]:
# Libraries
import pandas as pd
import numpy as np
from pickleshare import PickleShareDB
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os

## Chargement des données

In [2]:
# Open the PickleShare store named 'kity' under the prep_data directory.
# NOTE(review): PickleShare stores pickled objects — only read trusted files.
data_dir = '../prep_data'
db = PickleShareDB(os.path.join(data_dir, 'kity'))

# Fetch the five physical datasets from the store in one pass.
df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm = (
    db[key]
    for key in ('df_phy_1', 'df_phy_2', 'df_phy_3', 'df_phy_4', 'df_phy_norm')
)

## Préparation des données

In [3]:
# Tag every frame with the name of the dataset it came from, then stack them.
source_frames = [df_phy_1, df_phy_2, df_phy_3, df_phy_4, df_phy_norm]
source_names = ['phy_1', 'phy_2', 'phy_3', 'phy_4', 'phy_norm']

# assign() returns a tagged copy, so the loaded frames are left untouched.
dfs = [
    frame.assign(source=tag)
    for frame, tag in zip(source_frames, source_names)
]

# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
df_combined = pd.concat(dfs, ignore_index=True)
print("Combined shape:", df_combined.shape)

Combined shape: (10923, 30)


In [4]:
# Choose the PCA inputs: start from the numeric columns (non-numeric columns
# drop out here), then remove Label_n and the valve / pump columns by name.
excluded_names = ('Label_n',)
excluded_prefixes = ('Valv_', 'Pump_')

numeric_features = df_combined.select_dtypes(include=[np.number]).columns
features_for_pca = [
    column
    for column in numeric_features
    if not (column in excluded_names or column.startswith(excluded_prefixes))
]

print("Features selected for PCA:")
print(features_for_pca)

Features selected for PCA:
['Tank_1', 'Tank_2', 'Tank_3', 'Tank_4', 'Tank_5', 'Tank_6', 'Tank_7', 'Tank_8', 'Flow_sensor_1', 'Flow_sensor_4']


In [5]:
# Standardise the selected features so that PCA is not dominated by whichever
# column happens to have the largest scale.
X = df_combined[features_for_pca]
scaler = StandardScaler()

# fit_transform returns a bare array; wrap it again to keep the column names.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=features_for_pca)

## Détermination du nombre optimal de composantes

En se basant sur l'analyse de la variance expliquée ci-dessous, nous pouvons choisir le nombre optimal de composantes. Nous cherchons typiquement à capturer 80 à 90 % de la variance tout en gardant un nombre de composantes gérable.

In [6]:
# Full-rank PCA (one component per input feature), used only to inspect how
# the variance is spread across the components.
pca_var = PCA(n_components=len(features_for_pca)).fit(X_scaled)

# Share of variance carried by each component, and its running total.
explained_variance = pca_var.explained_variance_ratio_
cumulative_variance = explained_variance.cumsum()

# Tidy table backing the chart (also reused later when results are stored).
component_labels = [
    f'PC{rank}' for rank in range(1, len(explained_variance) + 1)
]
variance_df = pd.DataFrame({
    'Component': component_labels,
    'Explained_Variance': explained_variance,
    'Cumulative_Variance': cumulative_variance
})

# Bars: variance explained by each individual component.
individual_bars = go.Bar(
    x=variance_df['Component'],
    y=variance_df['Explained_Variance'],
    name='Individual',
    text=variance_df['Explained_Variance'].round(3),
    textposition='auto',
)

# Line: cumulative variance explained up to each component.
cumulative_line = go.Scatter(
    x=variance_df['Component'],
    y=variance_df['Cumulative_Variance'],
    name='Cumulative',
    line=dict(color='red'),
    mode='lines+markers'
)

fig = go.Figure(data=[individual_bars, cumulative_line])
fig.update_layout(
    title='Explained Variance Ratio by Principal Component',
    xaxis_title='Principal Component',
    yaxis_title='Explained Variance Ratio',
    showlegend=True
)
fig.show()

In [7]:
# Running total of explained variance, one line per principal component.
print("Cumulative Explained Variance:")
for rank, running_total in enumerate(cumulative_variance, start=1):
    print(f"PC{rank}: {running_total:.4f}")

Cumulative Explained Variance:
PC1: 0.3230
PC2: 0.5901
PC3: 0.7280
PC4: 0.8029
PC5: 0.8698
PC6: 0.9160
PC7: 0.9485
PC8: 0.9713
PC9: 0.9870
PC10: 1.0000


In [8]:
# Perform PCA with the selected number of components.
# Five components retain ~87% of the variance according to the cumulative
# table printed earlier in this notebook (PC5: 0.8698).
n_components = 5

# When fewer components than features are requested, scikit-learn's default
# svd_solver='auto' may select the randomized SVD (depending on the installed
# version). Fixing random_state keeps the fitted model and the projected
# coordinates identical from one run to the next; it has no effect when a
# deterministic solver is chosen.
pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Create DataFrame with the projected samples, one column per component.
pca_df = pd.DataFrame(
    X_pca,
    columns=[f'PC{i+1}' for i in range(n_components)]
)

# Add labels and source information. Assignment aligns on the index:
# df_combined was built with ignore_index=True, so both frames share the same
# 0..n-1 index and rows line up one to one.
pca_df['Label'] = df_combined['Label']
pca_df['Label_n'] = df_combined['Label_n']
pca_df['source'] = df_combined['source']

## Visualisation des résultats ACP

In [9]:
# Samples projected onto the first two components, coloured by the Label
# column; the originating dataset is shown on hover.
scatter_2d_options = dict(
    x='PC1',
    y='PC2',
    color='Label',
    title='PCA Results - First Two Components',
    hover_data=['source'],
)
fig = px.scatter(pca_df, **scatter_2d_options)
fig.show()

In [10]:
# Same projection extended to the third component, coloured by the Label
# column; the originating dataset is shown on hover.
scatter_3d_options = dict(
    x='PC1',
    y='PC2',
    z='PC3',
    color='Label',
    title='PCA Results - First Three Components',
    hover_data=['source'],
)
fig = px.scatter_3d(pca_df, **scatter_3d_options)
fig.show()

## Analyse de l'importance des caractéristiques  

In [11]:
# Loadings matrix: one row per retained component, one column per feature.
components_df = pd.DataFrame(
    pca.components_,
    columns=features_for_pca,
    index=[f'PC{i+1}' for i in range(n_components)]
)

# Loadings are signed, so the diverging RdBu scale is centred on zero:
# the neutral colour then means "no contribution" and the two hues separate
# positive from negative loadings. Without the midpoint the scale is centred
# on the middle of the observed range instead.
fig = px.imshow(
    components_df,
    title='PCA Components Matrix Heatmap',
    aspect='auto',
    color_continuous_scale='RdBu',
    color_continuous_midpoint=0
)
fig.show()

In [12]:
# For every retained component, list the five features whose loadings have
# the largest absolute value (the sign of the loading is ignored).
top_n = 5
for rank, loadings in enumerate(pca.components_[:n_components], start=1):
    print(f"\nTop features contributing to PC{rank}:")
    magnitudes = np.abs(loadings)
    # Stable sort on the negated magnitudes: descending order, with ties kept
    # in their original feature order.
    strongest = np.argsort(-magnitudes, kind='stable')[:top_n]
    for position in strongest:
        print(f"{features_for_pca[position]}: {magnitudes[position]:.3f}")


Top features contributing to PC1:
Flow_sensor_4: 0.445
Flow_sensor_1: 0.431
Tank_6: 0.414
Tank_8: 0.409
Tank_5: 0.393

Top features contributing to PC2:
Tank_1: 0.483
Tank_7: 0.479
Tank_4: 0.478
Tank_5: 0.314
Tank_6: 0.308

Top features contributing to PC3:
Tank_2: 0.658
Tank_3: 0.634
Tank_8: 0.249
Tank_4: 0.196
Tank_7: 0.189

Top features contributing to PC4:
Tank_3: 0.629
Flow_sensor_4: 0.416
Tank_2: 0.368
Tank_8: 0.314
Tank_6: 0.251

Top features contributing to PC5:
Tank_2: 0.607
Flow_sensor_1: 0.401
Flow_sensor_4: 0.363
Tank_3: 0.328
Tank_8: 0.275


## Sauvegarder les résultats dans PickleShareDB

In [13]:
# Bundle everything the Streamlit app needs into a single dictionary.
# Nothing is written here: the bundle is stored in the PickleShare database
# by the final cell of this notebook.
pca_results = dict(
    transformed_data=pca_df,          # projected samples + Label / Label_n / source
    explained_variance=variance_df,   # per-component and cumulative variance table
    components_matrix=components_df,  # loadings, components x features
    feature_names=features_for_pca,   # input columns, in fitting order
    pca_model=pca,                    # fitted reduced PCA
    scaler=scaler,                    # fitted StandardScaler
)


## Analyse ACP pour la classification binaire (normale vs attaque)

Pour compléter notre analyse, nous allons également examiner comment l'ACP se comporte dans le cas d'une classification binaire simple, en distinguant uniquement les états normaux des attaques. Cette approche nous permettra de mieux comprendre la séparation globale entre le comportement normal et anormal du système.

In [14]:
# Create visualizations for binary classification (Label_n).
#
# color_discrete_map below is keyed by True/False. Plotly Express only applies
# a discrete colour map when the colour column is non-numeric; a numeric 0/1
# column would be drawn with a continuous colour bar and the map ignored.
# Coercing to bool guarantees the intended red/blue colouring and is a no-op
# when the column is already boolean. The cast is done on a plotting copy so
# pca_df itself is left unchanged.
# NOTE(review): assumes Label_n only holds 0/1 or True/False (no missing
# values) — confirm against the data-preparation step.
binary_plot_df = pca_df.assign(Label_n=pca_df['Label_n'].astype(bool))

# 2D Scatter plot colored by Label_n
fig = px.scatter(
    binary_plot_df,
    x='PC1',
    y='PC2',
    color='Label_n',
    title='Résultats ACP - Deux Premières Composantes (Classification Binaire)',
    hover_data=['source'],
    color_discrete_map={True: 'red', False: 'blue'},
    labels={'Label_n': 'Attaque'}
)
fig.show()

# 3D Scatter plot for binary classification
fig = px.scatter_3d(
    binary_plot_df,
    x='PC1',
    y='PC2',
    z='PC3',
    color='Label_n',
    title='Résultats ACP - Trois Premières Composantes (Classification Binaire)',
    hover_data=['source'],
    color_discrete_map={True: 'red', False: 'blue'},
    labels={'Label_n': 'Attaque'}
)
fig.show()

## Analyse de la séparation binaire

Les visualisations ci-dessus nous permettent d'observer la séparation entre les états normaux et les attaques dans l'espace réduit par l'ACP. Cette représentation binaire peut être particulièrement utile pour :
- Évaluer la facilité de détection globale des attaques
- Identifier les zones de chevauchement potentielles entre comportement normal et anormal
- Repérer d'éventuels regroupements d'attaques dans l'espace des composantes principales

In [15]:
# Mean position of each Label_n group along the first three components.
centroids = pca_df.groupby('Label_n')[['PC1', 'PC2', 'PC3']].mean()

fig = go.Figure()

# For each class (normal first, then attack): draw its point cloud in the
# PC1/PC2 plane, followed by a large cross marking its centroid.
class_styles = [(False, 'blue', 'Normal'), (True, 'red', 'Attaque')]
for label, color, class_name in class_styles:
    members = pca_df[pca_df['Label_n'] == label]
    fig.add_trace(go.Scatter(
        x=members['PC1'],
        y=members['PC2'],
        mode='markers',
        name=class_name,
        marker=dict(color=color, size=5, opacity=0.6)
    ))

    # Black outline keeps the centroid visible on top of same-coloured points.
    centroid = centroids.loc[label]
    centroid_marker = dict(
        color=color,
        size=15,
        symbol='x',
        line=dict(width=2, color='black')
    )
    fig.add_trace(go.Scatter(
        x=[centroid['PC1']],
        y=[centroid['PC2']],
        mode='markers',
        name=f"Centroïde {class_name}",
        marker=centroid_marker
    ))

fig.update_layout(
    title='Résultats ACP avec Centroïdes (Classification Binaire)',
    xaxis_title='PC1',
    yaxis_title='PC2'
)
fig.show()

In [16]:

# Attach the binary (normal vs attack) artefacts to the results bundle.
pca_results['binary_centroids'] = centroids
pca_results['binary_vis_data'] = dict(
    pca_df=pca_df,  # projected samples, already carrying Label_n
    color_map={True: 'red', False: 'blue'},
    label_map={True: 'Attaque', False: 'Normal'},
)

# Write the complete bundle to the PickleShare database.
db['pca_results_phy'] = pca_results