In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import shap
import lime
import os
import lime.lime_tabular
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance

class ActivityXAIAnalyzer:
    """
    Explain cluster assignments through a surrogate classifier.

    A RandomForestClassifier is fitted to predict the given cluster labels from
    the standardized features. SHAP, LIME and permutation importance are then
    computed against that surrogate model.

    Result dictionaries are keyed 'Cluster_<label>', where <label> is the actual
    label value as stored in ``rf_model.classes_``.
    """

    def __init__(self, X, cluster_labels, random_state=42):
        """
        Initialize the XAI analyzer with data and cluster labels

        Parameters:
        X (array-like): Feature matrix from micro-Doppler signatures
        cluster_labels (array-like): Cluster assignments from part (a)
        random_state (int or None): Seed used for the random forest, the
            permutation importance, the LIME explainer and the LIME sample
            selection
        """
        self.X = X
        # Converted to an array so that `labels == value` is always an
        # element-wise mask, even when a plain Python list is passed in.
        self.cluster_labels = np.asarray(cluster_labels)
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.X_scaled = self.scaler.fit_transform(X)
        self.rf_model = RandomForestClassifier(n_estimators=100, random_state=random_state)
        self.rf_model.fit(self.X_scaled, self.cluster_labels)
        # Lazily filled (n_classes, n_features) array of mean |SHAP| values.
        self._mean_abs_shap = None

    @staticmethod
    def _split_shap_by_class(shap_values, n_classes):
        """
        Normalize the output of `explainer.shap_values` to one 2-D array per class.

        Accepted layouts:
        - list/tuple with one (n_samples, n_features) array per class
        - a single (n_samples, n_features, n_classes) array
        - a single (n_samples, n_features) array (reused for every class)

        Returns:
        list of n_classes arrays, each (n_samples, n_features)

        Raises:
        ValueError: if the layout is not recognized or the class count differs
        """
        if isinstance(shap_values, (list, tuple)):
            per_class = [np.asarray(values) for values in shap_values]
        else:
            stacked = np.asarray(shap_values)
            if stacked.ndim == 3:
                per_class = [stacked[:, :, k] for k in range(stacked.shape[2])]
            elif stacked.ndim == 2:
                per_class = [stacked] * n_classes
            else:
                raise ValueError(
                    f'Unsupported SHAP value layout with shape {stacked.shape}'
                )
        if len(per_class) != n_classes:
            raise ValueError(
                f'SHAP returned values for {len(per_class)} classes, expected {n_classes}'
            )
        return per_class

    @staticmethod
    def _top_k_indices(values, top_k):
        """
        Return the indices of the `top_k` largest entries, in ascending order of value.
        """
        order = np.argsort(values)
        # Explicit start index: a plain `order[-top_k:]` would return everything
        # for top_k == 0.
        return order[max(len(order) - max(int(top_k), 0), 0):]

    def _mean_abs_shap_per_class(self):
        """
        Compute (once) the mean absolute SHAP value of every feature for every class.

        Returns:
        np.ndarray of shape (n_classes, n_features), rows ordered like
        `rf_model.classes_`
        """
        cached = getattr(self, '_mean_abs_shap', None)
        if cached is None:
            explainer = shap.TreeExplainer(self.rf_model)
            raw_values = explainer.shap_values(self.X_scaled)
            per_class = self._split_shap_by_class(raw_values, len(self.rf_model.classes_))
            cached = np.stack([np.abs(values).mean(axis=0) for values in per_class])
            self._mean_abs_shap = cached
        return cached

    def analyze_shap_values(self, top_k=5):
        """
        Use SHAP values to explain feature importance for each cluster

        Parameters:
        top_k (int): Number of most important features to report per cluster

        Returns:
        dict: 'Cluster_<label>' -> {'indices': feature indices,
              'importance_values': mean |SHAP| of those features}, both ordered
              from least to most important among the selected features
        """
        importance = self._mean_abs_shap_per_class()

        feature_importance_per_cluster = {}
        for class_pos, label in enumerate(self.rf_model.classes_):
            cluster_importance = importance[class_pos]
            top_features_idx = self._top_k_indices(cluster_importance, top_k)
            feature_importance_per_cluster[f'Cluster_{label}'] = {
                'indices': top_features_idx,
                'importance_values': cluster_importance[top_features_idx]
            }

        return feature_importance_per_cluster

    def predict_proba_wrapper(self, X):
        """
        Wrapper for model predictions to ensure correct format for LIME

        A single 1-D sample is reshaped to a one-row matrix before prediction.
        """
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(1, -1)
        return self.rf_model.predict_proba(X)

    def analyze_lime(self, sample_indices=None, num_features=5):
        """
        Use LIME to explain individual instances from each cluster

        Each sample is explained with respect to its own assigned cluster.

        Parameters:
        sample_indices (iterable of int or None): Row indices to explain. When
            None, one sample per cluster is drawn with a generator seeded by
            `random_state`.
        num_features (int): Number of features in each explanation

        Returns:
        dict: 'Sample_<idx>' -> list of (feature description, weight) pairs
        """
        classes = self.rf_model.classes_
        labels = self.cluster_labels

        if sample_indices is None:
            rng = np.random.RandomState(getattr(self, 'random_state', None))
            sample_indices = [
                rng.choice(np.flatnonzero(labels == cluster)) for cluster in classes
            ]

        explainer = lime.lime_tabular.LimeTabularExplainer(
            self.X_scaled,
            mode='classification',
            feature_names=[f'feature_{i}' for i in range(self.X_scaled.shape[1])],
            class_names=[f'Cluster_{label}' for label in classes],
            random_state=getattr(self, 'random_state', None)
        )

        lime_explanations = {}
        for idx in sample_indices:
            # LIME addresses classes by their column in predict_proba, i.e. by
            # position in classes_. Without this, LIME would explain column 1
            # for every sample.
            class_pos = int(np.flatnonzero(classes == labels[idx])[0])
            explanation = explainer.explain_instance(
                self.X_scaled[idx],
                self.predict_proba_wrapper,
                labels=(class_pos,),
                num_features=num_features
            )
            lime_explanations[f'Sample_{idx}'] = explanation.as_list(label=class_pos)

        return lime_explanations

    def analyze_permutation_importance(self, top_k=5, n_repeats=10):
        """
        Calculate permutation importance for features

        Parameters:
        top_k (int): Number of most important features to report
        n_repeats (int): Number of shuffles per feature

        Returns:
        dict with 'importances_mean', 'importances_std' and 'top_features'
        (indices of the top_k features, least to most important)
        """
        result = permutation_importance(
            self.rf_model, self.X_scaled, self.cluster_labels,
            n_repeats=n_repeats,
            random_state=getattr(self, 'random_state', 42)
        )

        return {
            'importances_mean': result.importances_mean,
            'importances_std': result.importances_std,
            'top_features': self._top_k_indices(result.importances_mean, top_k)
        }

    def get_consensus_features(self, top_k=5):
        """
        Find, per cluster, the features ranked important by both SHAP and
        permutation importance

        LIME is not part of the consensus: its explanations are per instance and
        are available separately through `analyze_lime`.

        Parameters:
        top_k (int): Number of top features taken from each method

        Returns:
        dict: 'Cluster_<label>' -> sorted list of feature indices (int)
        """
        shap_results = self.analyze_shap_values(top_k=top_k)
        perm_results = self.analyze_permutation_importance(top_k=top_k)
        # Permutation importance is global, so the same set is used for every cluster.
        perm_features = {int(i) for i in perm_results['top_features']}

        consensus_features = {}
        for label in self.rf_model.classes_:
            cluster_key = f'Cluster_{label}'
            shap_features = {int(i) for i in shap_results[cluster_key]['indices']}
            consensus_features[cluster_key] = sorted(shap_features & perm_features)

        return consensus_features

    def visualize_feature_importance(self, method='shap', top_k=5):
        """
        Visualize feature importance for each cluster

        One panel is drawn per cluster so that bars of clusters sharing a feature
        do not cover each other.

        Parameters:
        method (str): Importance method to plot; only 'shap' is supported
        top_k (int): Number of features shown per cluster

        Raises:
        ValueError: if `method` is not supported
        """
        if method != 'shap':
            raise ValueError(f"Unsupported method {method!r}; expected 'shap'")

        results = self.analyze_shap_values(top_k=top_k)
        n_panels = max(len(results), 1)
        fig, axes = plt.subplots(
            1, n_panels, figsize=(max(12, 4 * n_panels), 6), squeeze=False
        )
        for ax, (cluster, data) in zip(axes[0], results.items()):
            ax.bar(
                [f'Feature_{i}' for i in data['indices']],
                data['importance_values']
            )
            ax.set_title(cluster)
            ax.set_xlabel('Features')
            ax.set_ylabel('Importance')
            ax.tick_params(axis='x', rotation=45)
        fig.suptitle('SHAP Feature Importance by Cluster')
        fig.tight_layout()
        plt.show()

In [2]:
def load_data_for_xai(base_path='data/a_cluster', max_rows=409):
    """
    Load and format data for XAI analysis

    Every sub-directory of `base_path` whose name starts with 'cluster_' is one
    cluster. Each '.npy' file inside it is one sample: the array is truncated to
    its first `max_rows` rows and flattened into a feature vector.

    Cluster directories are ordered by their numeric suffix (so 'cluster_2'
    comes before 'cluster_10'); names without a numeric suffix follow, in
    alphabetical order. The label of a sample is the position of its directory
    in that ordering.

    Parameters:
    base_path (str): Path to the cluster directory
    max_rows (int or None): Number of leading rows kept from each array before
        flattening; None keeps all rows. The default of 409 is the value that
        was previously hard-coded — its meaning is not documented here, confirm
        against the data producer.

    Returns:
    X (np.array): Feature matrix where each row is a sample
    cluster_labels (np.array): Cluster labels for each sample

    Raises:
    ValueError: if two samples flatten to different lengths
    """
    prefix = 'cluster_'

    def _cluster_order(name):
        # Numeric suffixes sort by value, anything else sorts after them by name.
        suffix = name[len(prefix):]
        if suffix.isdecimal():
            return (0, int(suffix), name)
        return (1, 0, name)

    # Only real directories take part in the numbering, so a stray file named
    # 'cluster_*' cannot shift the labels.
    cluster_dirs = sorted(
        (
            entry for entry in os.listdir(base_path)
            if entry.startswith(prefix) and os.path.isdir(os.path.join(base_path, entry))
        ),
        key=_cluster_order,
    )

    all_samples = []
    all_labels = []
    expected_len = None

    for cluster_idx, cluster_dir in enumerate(cluster_dirs):
        cluster_path = os.path.join(base_path, cluster_dir)
        npy_files = sorted(f for f in os.listdir(cluster_path) if f.endswith('.npy'))

        for npy_file in npy_files:
            file_path = os.path.join(cluster_path, npy_file)
            data = np.atleast_1d(np.load(file_path))
            # Slicing only the first axis also works for 1-D arrays.
            data_flat = data[:max_rows].flatten()

            if expected_len is None:
                expected_len = data_flat.size
            elif data_flat.size != expected_len:
                raise ValueError(
                    f'{file_path} yields {data_flat.size} features, '
                    f'expected {expected_len}'
                )

            all_samples.append(data_flat)
            all_labels.append(cluster_idx)

    X = np.array(all_samples)
    cluster_labels = np.array(all_labels)

    return X, cluster_labels

In [3]:
# Build the feature matrix and the per-sample cluster labels from the saved cluster folders
X, cluster_labels = load_data_for_xai(base_path='data/a_cluster')

# Fit the analyzer on the loaded samples
xai_analyzer = ActivityXAIAnalyzer(X=X, cluster_labels=cluster_labels)

# Per-cluster features that the importance methods agree on
important_features = xai_analyzer.get_consensus_features()

# Plot the SHAP-based importances
xai_analyzer.visualize_feature_importance(method='shap')