In [2]:
!pip install shap

Defaulting to user installation because normal site-packages is not writeable
Collecting shap
  Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting cloudpickle (from shap)
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (540 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.1/540.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading slicer-0.0.8-py3-none-any.whl (15 kB)
Downloading cloudpickle-3.1.1-py3-none-any.whl (20 kB)
Installing collected packages: slicer, cloudpickle, shap
Successfully installed cloudpickle-3.1.1 shap-0.46.0 slicer-0.0.8

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pi

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
import shap

# Read the preprocessed dataset from disk
file_path = "processed_dataset_with_clusters.csv"
df = pd.read_csv(file_path)

# Column layout: the trailing 5 columns are the target variables,
# every column before them is treated as a feature
target_columns = df.columns[-5:]
feature_columns = df.columns[:-5]
features = df[feature_columns]

# Z-score normalization: fit StandardScaler on the features, then transform them
scaler = StandardScaler()
normalized_features = scaler.fit(features).transform(features)
df_normalized = pd.DataFrame(data=normalized_features, columns=feature_columns)

# Group feature columns into sensor modalities; a column belongs to a modality
# when its name contains that modality's (case-sensitive) tag
modalities = {
    modality: [col for col in feature_columns if tag in col]
    for modality, tag in (("HRV", "hrv"), ("ACC", "acc"), ("EDA", "eda"))
}

# Empty containers that the per-target analysis fills in
feature_analysis_results, modality_importance = {}, {}

# Analyze each target variable independently. For every target, four per-feature
# relevance scores are computed (absolute correlation, mutual information,
# Random Forest importance, mean |SHAP|), stored as one table per target, and
# the Random Forest importances are then summed per sensor modality.
for target_variable in target_columns:
    target_values = df[target_variable]

    # Absolute correlation of every feature with the target.
    # The result is deliberately kept in feature_columns order (NOT sorted):
    # it is combined positionally with the other score arrays below, so sorting
    # here would attach each correlation value to the wrong feature.
    # NOTE(review): zero-variance columns give NaN here — confirm whether such
    # columns should be dropped upstream.
    correlations = df_normalized.corrwith(target_values).abs().reindex(feature_columns)

    # Mutual information between each feature and the target (seeded for reproducibility)
    mi_scores = mutual_info_regression(df_normalized, target_values, random_state=42)

    # Train a small Random Forest and read its impurity-based feature importances
    rf = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)
    rf.fit(df_normalized, target_values)
    feature_importance = rf.feature_importances_

    # SHAP values of the fitted forest, computed on the same (normalized) data
    explainer = shap.TreeExplainer(rf)
    shap_values = explainer.shap_values(df_normalized)

    # One row per feature; every column below is in feature_columns order
    feature_analysis = pd.DataFrame({
        "Feature": feature_columns,
        "Correlation": correlations.to_numpy(),
        "RandomForestImportance": feature_importance,
        "MutualInfo": mi_scores,
        # Mean absolute SHAP value over all samples (axis 0)
        "SHAP_Mean": np.abs(shap_values).mean(axis=0)
    })

    # Rank features by Random Forest importance, most important first
    feature_analysis = feature_analysis.sort_values(by="RandomForestImportance", ascending=False)
    feature_analysis_results[target_variable] = feature_analysis

    # Sum the Random Forest importance of the features belonging to each modality.
    # The comprehension variable is named modality_features so it does not shadow
    # the module-level `features` DataFrame.
    modality_scores = {
        modality: feature_analysis.loc[
            feature_analysis["Feature"].isin(modality_features), "RandomForestImportance"
        ].sum()
        for modality, modality_features in modalities.items()
    }
    modality_importance[target_variable] = modality_scores

# Print the first 10 rows of each per-target table (the most important features),
# followed by blank lines as a separator
for target, analysis_df in feature_analysis_results.items():
    print(f"Feature Importance for {target}:", analysis_df.iloc[:10], "\n", sep="\n")

# Tabulate the per-target modality scores and print the table
modality_importance_df = pd.DataFrame.from_dict(modality_importance, orient="columns")
print("Modality-Level Feature Importance:", modality_importance_df, sep="\n")

# Flatten the per-modality column lists (in the dict's insertion order) and
# compute the pairwise correlation matrix of those normalized features
modality_feature_columns = []
for column_group in modalities.values():
    modality_feature_columns.extend(column_group)
modality_corr = df_normalized[modality_feature_columns].corr()

# Draw the matrix as a heatmap with a diverging colormap centered on zero
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(modality_corr, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Correlation Matrix Between Modalities")
plt.show()

# Optional: histograms of the raw (pre-normalization) features on a 3x3 grid
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
fig.suptitle("Feature Distributions Before Normalization", fontsize=16)

# Only the first 9 feature columns are plotted, one per grid cell
sample_features = features.columns[:9]

for i, col in enumerate(sample_features):
    # axes.flat walks the grid row by row, so position i is (i // 3, i % 3)
    ax = axes.flat[i]
    sns.histplot(features[col], ax=ax, kde=True, bins=50)
    ax.set_title(col)

fig.tight_layout()
plt.show()


  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
