In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import f_classif, mutual_info_classif, RFE
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from mpl_toolkits.mplot3d import Axes3D
import shap
import warnings
warnings.filterwarnings('ignore')

# Function to create directory if it doesn't exist
def create_dir_if_not_exists(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Created directory: {directory}")

def load_feature_data(feature_file_path):
    """Load feature data from CSV file"""
    try:
        df = pd.read_csv(feature_file_path)
        print(f"Loaded data with {df.shape[0]} samples and {df.shape[1]} features")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

def prepare_data_for_modeling(df):
    """Prepare data for modeling by separating features and target"""
    # Extract character name as target
    X = df.copy()
    
    # Assuming 'char_name' contains the Bengali character labels
    y = X['char_name'] if 'char_name' in X.columns else X['file_name']
    
    # Remove non-feature columns
    non_feature_cols = ['file_name', 'char_name']
    X = X.drop([col for col in non_feature_cols if col in X.columns], axis=1)
    
    # Encode the target variable
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    
    # Handle any remaining non-numeric columns
    for col in X.columns:
        if X[col].dtype == 'object':
            X = X.drop(col, axis=1)
    
    # Fill NaN values with column means
    X = X.fillna(X.mean())
    
    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X, X_scaled, y, y_encoded, label_encoder, X.columns

def plot_correlation_heatmap(X, feature_names, output_dir):
    """Generate a correlation heatmap of features."""
    # Create DataFrame with feature names
    df_corr = pd.DataFrame(X, columns=feature_names)
    
    # Calculate correlation matrix
    corr_matrix = df_corr.corr()
    
    # Create mask for the upper triangle
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    
    # Set up the matplotlib figure
    plt.figure(figsize=(max(10, len(feature_names) * 0.5), max(8, len(feature_names) * 0.5)))

    # Plot the heatmap
    sns.heatmap(
        corr_matrix,
        mask=mask,
        cmap='coolwarm',
        annot=False,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
        xticklabels=corr_matrix.columns,
        yticklabels=corr_matrix.columns
    )
    
    # Improve layout
    plt.xticks(rotation=45, ha='right', fontsize=10)
    plt.yticks(rotation=0, fontsize=10)
    plt.title('Feature Correlation Heatmap', fontsize=18, pad=20)
    plt.tight_layout()

    # Save the plot
    output_path = os.path.join(output_dir, 'correlation_heatmap.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved correlation heatmap to {output_path}")

def plot_random_forest_importance(X_scaled, y_encoded, feature_names, output_dir):
    """Generate a random forest feature importance plot"""
    # Train Random Forest model
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_scaled, y_encoded)
    
    # Get feature importances
    importances = rf.feature_importances_
    indices = np.argsort(importances)[::-1]
    
    # Select top 30 features for readability
    top_n = min(30, len(feature_names))
    top_indices = indices[:top_n]
    
    plt.figure(figsize=(12, 8))
    plt.title('Random Forest Feature Importance', fontsize=16)
    plt.bar(range(top_n), importances[top_indices], align='center')
    plt.xticks(range(top_n), [feature_names[i] for i in top_indices], rotation=90)
    plt.xlim([-1, top_n])
    plt.tight_layout()
    
    # Save the plot
    output_path = os.path.join(output_dir, 'random_forest_importance.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved random forest importance plot to {output_path}")
    
    return rf  # Return the model for later use

def plot_pca_scatter(X_scaled, y_encoded, label_encoder, output_dir):
    """Generate 2D and 3D PCA scatter plots"""
    # 2D PCA
    pca_2d = PCA(n_components=2)
    X_pca_2d = pca_2d.fit_transform(X_scaled)
    
    # Plot 2D PCA
    plt.figure(figsize=(12, 10))
    
    # Get unique classes
    unique_classes = np.unique(y_encoded)
    
    # Create a colormap
    cmap = plt.cm.get_cmap('tab20', len(unique_classes))
    
    # Plot each class
    for i, class_idx in enumerate(unique_classes):
        plt.scatter(X_pca_2d[y_encoded == class_idx, 0], 
                    X_pca_2d[y_encoded == class_idx, 1],
                    color=cmap(i), 
                    alpha=0.7,
                    label=f"{label_encoder.inverse_transform([class_idx])[0]}")
    
    plt.title('2D PCA of Features', fontsize=16)
    plt.xlabel(f'PC1 ({pca_2d.explained_variance_ratio_[0]:.2%} variance)')
    plt.ylabel(f'PC2 ({pca_2d.explained_variance_ratio_[1]:.2%} variance)')
    
    # Add legend if not too many classes
    if len(unique_classes) <= 20:
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    
    plt.tight_layout()
    
    # Save 2D plot
    output_path_2d = os.path.join(output_dir, 'pca_2d_scatter.png')
    plt.savefig(output_path_2d, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved 2D PCA scatter plot to {output_path_2d}")
    
    # 3D PCA
    pca_3d = PCA(n_components=3)
    X_pca_3d = pca_3d.fit_transform(X_scaled)
    
    # Plot 3D PCA
    fig = plt.figure(figsize=(12, 10))
    ax = fig.add_subplot(111, projection='3d')
    
    # Plot each class
    for i, class_idx in enumerate(unique_classes):
        ax.scatter(X_pca_3d[y_encoded == class_idx, 0],
                   X_pca_3d[y_encoded == class_idx, 1],
                   X_pca_3d[y_encoded == class_idx, 2],
                   color=cmap(i),
                   alpha=0.7,
                   label=f"{label_encoder.inverse_transform([class_idx])[0]}")
    
    ax.set_title('3D PCA of Features', fontsize=16)
    ax.set_xlabel(f'PC1 ({pca_3d.explained_variance_ratio_[0]:.2%} variance)')
    ax.set_ylabel(f'PC2 ({pca_3d.explained_variance_ratio_[1]:.2%} variance)')
    ax.set_zlabel(f'PC3 ({pca_3d.explained_variance_ratio_[2]:.2%} variance)')
    
    # Add legend if not too many classes
    if len(unique_classes) <= 20:
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    
    plt.tight_layout()
    
    # Save 3D plot
    output_path_3d = os.path.join(output_dir, 'pca_3d_scatter.png')
    plt.savefig(output_path_3d, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved 3D PCA scatter plot to {output_path_3d}")

def plot_permutation_importance(model, X_scaled, y_encoded, feature_names, output_dir):
    """Generate permutation feature importance plot"""
    # Calculate permutation importance
    result = permutation_importance(model, X_scaled, y_encoded, n_repeats=10, random_state=42)
    perm_importance = result.importances_mean
    
    # Sort features by importance
    indices = np.argsort(perm_importance)[::-1]
    
    # Select top 30 features for readability
    top_n = min(30, len(feature_names))
    top_indices = indices[:top_n]
    
    plt.figure(figsize=(12, 8))
    plt.title('Permutation Feature Importance', fontsize=16)
    plt.bar(range(top_n), perm_importance[top_indices], align='center')
    plt.xticks(range(top_n), [feature_names[i] for i in top_indices], rotation=90)
    plt.xlim([-1, top_n])
    plt.tight_layout()
    
    # Save the plot
    output_path = os.path.join(output_dir, 'permutation_importance.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved permutation importance plot to {output_path}")

def plot_shap_summary(model, X_scaled, feature_names, output_dir):
    """Generate SHAP values summary plot"""
    # Create a smaller subset if dataset is large (SHAP can be computationally intensive)
    max_samples = min(500, X_scaled.shape[0])
    X_sample = X_scaled[:max_samples]
    
    try:
        # SHAP explainer for Random Forest
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_sample)
        
        # If multi-class, take the mean of absolute SHAP values across classes
        if isinstance(shap_values, list):
            # Average across all classes for feature importance
            shap_values_mean = np.abs(np.array(shap_values)).mean(axis=0)
            
            plt.figure(figsize=(12, 10))
            
            # Create DataFrame for SHAP summary plot
            shap_df = pd.DataFrame(shap_values_mean, columns=feature_names)
            
            # Get feature importance based on mean absolute SHAP values
            feature_importance = np.mean(np.abs(shap_values_mean), axis=0)
            sorted_idx = np.argsort(feature_importance)
            
            # Select top 30 features
            top_n = min(30, len(feature_names))
            top_idx = sorted_idx[-top_n:]
            
            # Plot
            plt.barh(range(top_n), feature_importance[top_idx])
            plt.yticks(range(top_n), [feature_names[i] for i in top_idx])
            plt.xlabel('Mean |SHAP value|')
            plt.title('Feature Importance based on SHAP Values')
            
            # Save the plot
            output_path = os.path.join(output_dir, 'shap_summary.png')
            plt.savefig(output_path, dpi=300, bbox_inches='tight')
            plt.close()
            
            # Also create a traditional SHAP summary plot for a single class if not too many features
            if len(feature_names) <= 30:
                plt.figure(figsize=(12, 10))
                # Use the first class for the summary plot
                shap.summary_plot(shap_values[0], pd.DataFrame(X_sample, columns=feature_names), 
                                 show=False, max_display=20)
                plt.title('SHAP Summary Plot (First Class)', fontsize=16)
                plt.tight_layout()
                # Save the plot
                output_path = os.path.join(output_dir, 'shap_summary_first_class.png')
                plt.savefig(output_path, dpi=300, bbox_inches='tight')
                plt.close()
        else:
            # Binary classification case
            plt.figure(figsize=(12, 10))
            shap.summary_plot(shap_values, pd.DataFrame(X_sample, columns=feature_names), 
                             show=False, max_display=20)
            plt.title('SHAP Summary Plot', fontsize=16)
            plt.tight_layout()
            
            # Save the plot
            output_path = os.path.join(output_dir, 'shap_summary.png')
            plt.savefig(output_path, dpi=300, bbox_inches='tight')
            plt.close()
        
        print(f"Saved SHAP summary plot to {output_dir}")
    except Exception as e:
        print(f"Error creating SHAP plot: {e}")

def plot_univariate_importance(X_scaled, y_encoded, feature_names, output_dir):
    """Generate univariate feature importance plots using F-test and Mutual Information"""
    # F-test
    f_values, p_values = f_classif(X_scaled, y_encoded)
    
    # Sort features by F-values
    f_indices = np.argsort(f_values)[::-1]
    
    # Select top 30 features for readability
    top_n = min(30, len(feature_names))
    top_f_indices = f_indices[:top_n]
    
    plt.figure(figsize=(12, 8))
    plt.title('Univariate Feature Importance (F-test)', fontsize=16)
    plt.bar(range(top_n), f_values[top_f_indices], align='center')
    plt.xticks(range(top_n), [feature_names[i] for i in top_f_indices], rotation=90)
    plt.xlim([-1, top_n])
    plt.tight_layout()
    
    # Save the F-test plot
    output_path_f = os.path.join(output_dir, 'f_test_importance.png')
    plt.savefig(output_path_f, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved F-test importance plot to {output_path_f}")
    
    # Mutual Information
    mi_values = mutual_info_classif(X_scaled, y_encoded, random_state=42)
    
    # Sort features by MI values
    mi_indices = np.argsort(mi_values)[::-1]
    
    # Select top 30 features
    top_mi_indices = mi_indices[:top_n]
    
    plt.figure(figsize=(12, 8))
    plt.title('Univariate Feature Importance (Mutual Information)', fontsize=16)
    plt.bar(range(top_n), mi_values[top_mi_indices], align='center')
    plt.xticks(range(top_n), [feature_names[i] for i in top_mi_indices], rotation=90)
    plt.xlim([-1, top_n])
    plt.tight_layout()
    
    # Save the MI plot
    output_path_mi = os.path.join(output_dir, 'mutual_info_importance.png')
    plt.savefig(output_path_mi, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved Mutual Information importance plot to {output_path_mi}")
    
    return f_values, mi_values

def plot_rfe_ranking(X_scaled, y_encoded, feature_names, output_dir):
    """Generate Recursive Feature Elimination (RFE) ranking plot"""
    # Create a classifier for RFE
    rf = RandomForestClassifier(n_estimators=50, random_state=42)
    
    # Use a subset of features if there are too many
    n_features_to_select = min(30, len(feature_names))
    
    # Run RFE
    rfe = RFE(estimator=rf, n_features_to_select=n_features_to_select, step=1)
    rfe.fit(X_scaled, y_encoded)
    
    # Get feature ranking (the smaller the number, the more important the feature)
    ranking = rfe.ranking_
    
    # Sort features by ranking
    ranking_indices = np.argsort(ranking)
    
    # Select top 30 features
    top_n = min(30, len(feature_names))
    top_ranking_indices = ranking_indices[:top_n]
    
    plt.figure(figsize=(12, 8))
    plt.title('Recursive Feature Elimination (RFE) Ranking', fontsize=16)
    plt.bar(range(top_n), [1/r if r > 0 else float('inf') for r in ranking[top_ranking_indices]], align='center')
    plt.xticks(range(top_n), [feature_names[i] for i in top_ranking_indices], rotation=90)
    plt.xlim([-1, top_n])
    plt.ylabel('1/Ranking (higher is more important)')
    plt.tight_layout()
    
    # Save the plot
    output_path = os.path.join(output_dir, 'rfe_ranking.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved RFE ranking plot to {output_path}")

def plot_lasso_coefficients(X_scaled, y_encoded, feature_names, output_dir):
    """Generate LASSO coefficient plot for sparse feature selection"""
    # For multiclass, use one-vs-rest approach
    n_classes = len(np.unique(y_encoded))
    
    # Create binary target variables for One-vs-Rest
    y_binary = np.zeros((len(y_encoded), n_classes))
    for i in range(n_classes):
        y_binary[:, i] = (y_encoded == i).astype(int)
    
    # Train LASSO model for each class
    alpha = 0.01  # Adjust if needed
    coefs = np.zeros((n_classes, X_scaled.shape[1]))
    
    for i in range(n_classes):
        # Train LASSO for this class
        lasso = Lasso(alpha=alpha, max_iter=10000)
        lasso.fit(X_scaled, y_binary[:, i])
        coefs[i, :] = lasso.coef_
    
    # Compute average absolute coefficient for each feature across classes
    avg_abs_coefs = np.mean(np.abs(coefs), axis=0)
    
    # Sort features by coefficient magnitude
    coef_indices = np.argsort(avg_abs_coefs)[::-1]
    
    # Select top features
    top_n = min(30, len(feature_names))
    top_coef_indices = coef_indices[:top_n]
    
    plt.figure(figsize=(12, 8))
    plt.title('LASSO Coefficient Magnitude (Averaged Across Classes)', fontsize=16)
    plt.bar(range(top_n), avg_abs_coefs[top_coef_indices], align='center')
    plt.xticks(range(top_n), [feature_names[i] for i in top_coef_indices], rotation=90)
    plt.xlim([-1, top_n])
    plt.tight_layout()
    
    # Save the plot
    output_path = os.path.join(output_dir, 'lasso_coefficients.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved LASSO coefficient plot to {output_path}")

def create_feature_importance_summary(feature_names, rf_importances, perm_importances, 
                                     f_values, mi_values, output_dir):
    """Create a summary CSV of feature importances from different methods"""
    # Create a DataFrame to hold importance values
    importance_df = pd.DataFrame({'Feature': feature_names,
                                  'RandomForest': rf_importances,
                                  'Permutation': perm_importances,
                                  'F_Test': f_values,
                                  'MutualInfo': mi_values})
    
    # Sort by average rank across methods
    # First, create rank columns (higher value = more important)
    for col in ['RandomForest', 'Permutation', 'F_Test', 'MutualInfo']:
        importance_df[f'{col}_Rank'] = importance_df[col].rank(ascending=False)
    
    # Calculate average rank
    rank_cols = [c for c in importance_df.columns if c.endswith('_Rank')]
    importance_df['AvgRank'] = importance_df[rank_cols].mean(axis=1)
    
    # Sort by average rank
    importance_df = importance_df.sort_values('AvgRank')
    
    # Save to CSV
    output_path = os.path.join(output_dir, 'feature_importance_summary.csv')
    importance_df.to_csv(output_path, index=False)
    print(f"Saved feature importance summary to {output_path}")

def main():
    """Main function to coordinate visualization tasks"""
    # Define input and output directories
    feature_file = "featureExtraction/all_features.csv"
    output_dir = "visualizations"
    
    # Create output directory
    create_dir_if_not_exists(output_dir)
    
    # Load feature data
    df = load_feature_data(feature_file)
    if df is None:
        return
    
    # Prepare data for modeling
    X, X_scaled, y, y_encoded, label_encoder, feature_names = prepare_data_for_modeling(df)
    
    # Plot correlation heatmap
    print("\nGenerating correlation heatmap...")
    plot_correlation_heatmap(X, feature_names, output_dir)
    
    # Plot Random Forest feature importance
    print("\nGenerating random forest feature importance...")
    rf_model = plot_random_forest_importance(X_scaled, y_encoded, feature_names, output_dir)
    
    # Plot PCA scatter plots
    print("\nGenerating PCA scatter plots...")
    plot_pca_scatter(X_scaled, y_encoded, label_encoder, output_dir)
    
    # Plot permutation feature importance
    print("\nGenerating permutation feature importance...")
    result = permutation_importance(rf_model, X_scaled, y_encoded, n_repeats=10, random_state=42)
    perm_importances = result.importances_mean
    plot_permutation_importance(rf_model, X_scaled, y_encoded, feature_names, output_dir)
    
    # Plot SHAP values summary
    print("\nGenerating SHAP values summary plot...")
    plot_shap_summary(rf_model, X_scaled, feature_names, output_dir)
    
    # Plot univariate feature importance
    print("\nGenerating univariate feature importance plots...")
    f_values, mi_values = plot_univariate_importance(X_scaled, y_encoded, feature_names, output_dir)
    
    # Plot RFE ranking
    print("\nGenerating Recursive Feature Elimination (RFE) ranking plot...")
    plot_rfe_ranking(X_scaled, y_encoded, feature_names, output_dir)
    
    # Plot LASSO coefficients
    print("\nGenerating LASSO coefficient plot...")
    plot_lasso_coefficients(X_scaled, y_encoded, feature_names, output_dir)
    
    # Create feature importance summary
    print("\nCreating feature importance summary...")
    create_feature_importance_summary(feature_names, 
                                     rf_model.feature_importances_,
                                     perm_importances,
                                     f_values,
                                     mi_values,
                                     output_dir)
    
    print("\nAll visualizations completed successfully!")

if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'shap'

In [5]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import f_classif, mutual_info_classif, RFE
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from mpl_toolkits.mplot3d import Axes3D
import warnings
warnings.filterwarnings('ignore')

# Function to create directory if it doesn't exist
def create_dir_if_not_exists(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Created directory: {directory}")

def load_feature_data(feature_file_path):
    """Load feature data from CSV file"""
    try:
        df = pd.read_csv(feature_file_path)
        print(f"Loaded data with {df.shape[0]} samples and {df.shape[1]} features")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

def prepare_data_for_modeling(df):
    """Prepare data for modeling by separating features and target"""
    # Extract character name as target
    X = df.copy()
    
    # Assuming 'char_name' contains the Bengali character labels
    y = X['char_name'] if 'char_name' in X.columns else X['file_name']
    
    # Remove non-feature columns
    non_feature_cols = ['file_name', 'char_name']
    X = X.drop([col for col in non_feature_cols if col in X.columns], axis=1)
    
    # Encode the target variable
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    
    # Handle any remaining non-numeric columns
    for col in X.columns:
        if X[col].dtype == 'object':
            X = X.drop(col, axis=1)
    
    # Fill NaN values with column means
    X = X.fillna(X.mean())
    
    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X, X_scaled, y, y_encoded, label_encoder, X.columns

def plot_correlation_heatmap(X, feature_names, output_dir):
    """Generate a correlation heatmap of features"""
    plt.figure(figsize=(16, 14))
    
    # Create DataFrame for correlation
    df_corr = pd.DataFrame(X, columns=feature_names)
    
    # Calculate correlation matrix
    corr_matrix = df_corr.corr()
    
    # Create mask for the upper triangle
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    
    # Plot heatmap
    sns.heatmap(corr_matrix, mask=mask, cmap='coolwarm', annot=False, 
                square=True, linewidths=.5, cbar_kws={"shrink": .5})
    
    plt.title('Feature Correlation Heatmap', fontsize=16)
    plt.tight_layout()
    
    # Save the plot
    output_path = os.path.join(output_dir, 'correlation_heatmap.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved correlation heatmap to {output_path}")

def plot_random_forest_importance(X_scaled, y_encoded, feature_names, output_dir):
    """Generate a random forest feature importance plot"""
    # Train Random Forest model
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_scaled, y_encoded)
    
    # Get feature importances
    importances = rf.feature_importances_
    indices = np.argsort(importances)[::-1]
    
    # Select top 30 features for readability
    top_n = min(30, len(feature_names))
    top_indices = indices[:top_n]
    
    plt.figure(figsize=(12, 8))
    plt.title('Random Forest Feature Importance', fontsize=16)
    plt.bar(range(top_n), importances[top_indices], align='center')
    plt.xticks(range(top_n), [feature_names[i] for i in top_indices], rotation=90)
    plt.xlim([-1, top_n])
    plt.tight_layout()
    
    # Save the plot
    output_path = os.path.join(output_dir, 'random_forest_importance.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved random forest importance plot to {output_path}")
    
    return rf  # Return the model for later use

def plot_pca_scatter(X_scaled, y_encoded, label_encoder, output_dir):
    """Generate 2D and 3D PCA scatter plots"""
    # 2D PCA
    pca_2d = PCA(n_components=2)
    X_pca_2d = pca_2d.fit_transform(X_scaled)
    
    # Plot 2D PCA
    plt.figure(figsize=(12, 10))
    
    # Get unique classes
    unique_classes = np.unique(y_encoded)
    
    # Create a colormap
    cmap = plt.cm.get_cmap('tab20', len(unique_classes))
    
    # Plot each class
    for i, class_idx in enumerate(unique_classes):
        plt.scatter(X_pca_2d[y_encoded == class_idx, 0], 
                    X_pca_2d[y_encoded == class_idx, 1],
                    color=cmap(i), 
                    alpha=0.7,
                    label=f"{label_encoder.inverse_transform([class_idx])[0]}")
    
    plt.title('2D PCA of Features', fontsize=16)
    plt.xlabel(f'PC1 ({pca_2d.explained_variance_ratio_[0]:.2%} variance)')
    plt.ylabel(f'PC2 ({pca_2d.explained_variance_ratio_[1]:.2%} variance)')
    
    # Add legend if not too many classes
    if len(unique_classes) <= 20:
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    
    plt.tight_layout()
    
    # Save 2D plot
    output_path_2d = os.path.join(output_dir, 'pca_2d_scatter.png')
    plt.savefig(output_path_2d, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved 2D PCA scatter plot to {output_path_2d}")
    
    # 3D PCA
    pca_3d = PCA(n_components=3)
    X_pca_3d = pca_3d.fit_transform(X_scaled)
    
    # Plot 3D PCA
    fig = plt.figure(figsize=(12, 10))
    ax = fig.add_subplot(111, projection='3d')
    
    # Plot each class
    for i, class_idx in enumerate(unique_classes):
        ax.scatter(X_pca_3d[y_encoded == class_idx, 0],
                   X_pca_3d[y_encoded == class_idx, 1],
                   X_pca_3d[y_encoded == class_idx, 2],
                   color=cmap(i),
                   alpha=0.7,
                   label=f"{label_encoder.inverse_transform([class_idx])[0]}")
    
    ax.set_title('3D PCA of Features', fontsize=16)
    ax.set_xlabel(f'PC1 ({pca_3d.explained_variance_ratio_[0]:.2%} variance)')
    ax.set_ylabel(f'PC2 ({pca_3d.explained_variance_ratio_[1]:.2%} variance)')
    ax.set_zlabel(f'PC3 ({pca_3d.explained_variance_ratio_[2]:.2%} variance)')
    
    # Add legend if not too many classes
    if len(unique_classes) <= 20:
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    
    plt.tight_layout()
    
    # Save 3D plot
    output_path_3d = os.path.join(output_dir, 'pca_3d_scatter.png')
    plt.savefig(output_path_3d, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved 3D PCA scatter plot to {output_path_3d}")

def plot_permutation_importance(model, X_scaled, y_encoded, feature_names, output_dir):
    """Generate permutation feature importance plot"""
    # Calculate permutation importance
    result = permutation_importance(model, X_scaled, y_encoded, n_repeats=10, random_state=42)
    perm_importance = result.importances_mean
    
    # Sort features by importance
    indices = np.argsort(perm_importance)[::-1]
    
    # Select top 30 features for readability
    top_n = min(30, len(feature_names))
    top_indices = indices[:top_n]
    
    plt.figure(figsize=(12, 8))
    plt.title('Permutation Feature Importance', fontsize=16)
    plt.bar(range(top_n), perm_importance[top_indices], align='center')
    plt.xticks(range(top_n), [feature_names[i] for i in top_indices], rotation=90)
    plt.xlim([-1, top_n])
    plt.tight_layout()
    
    # Save the plot
    output_path = os.path.join(output_dir, 'permutation_importance.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved permutation importance plot to {output_path}")
    
    return perm_importance

def plot_feature_importance_heatmap(model, X_scaled, y_encoded, feature_names, output_dir):
    """Generate a feature importance heatmap as an alternative to SHAP plots"""
    # Get the number of classes
    n_classes = len(np.unique(y_encoded))
    
    # For small number of classes, we can create class-specific importances
    if n_classes <= 10:
        class_importances = {}
        
        # Compute separate permutation importance for each class
        for class_idx in range(n_classes):
            # Create binary target for this class (one-vs-rest)
            y_binary = (y_encoded == class_idx).astype(int)
            
            # Calculate permutation importance for this binary problem
            result = permutation_importance(
                model, X_scaled, y_binary, 
                n_repeats=5, random_state=42
            )
            class_importances[class_idx] = result.importances_mean
        
        # Get top N features by average importance across classes
        avg_importances = np.zeros(len(feature_names))
        for class_idx in range(n_classes):
            avg_importances += class_importances[class_idx]
        avg_importances /= n_classes
        
        # Get top features
        top_n = min(20, len(feature_names))
        top_indices = np.argsort(avg_importances)[::-1][:top_n]
        
        # Create a matrix for heatmap (features x classes)
        importance_matrix = np.zeros((top_n, n_classes))
        for i, feature_idx in enumerate(top_indices):
            for class_idx in range(n_classes):
                importance_matrix[i, class_idx] = class_importances[class_idx][feature_idx]
        
        # Plot heatmap
        plt.figure(figsize=(12, 10))
        sns.heatmap(importance_matrix, annot=True, cmap='viridis', fmt='.3f',
                   yticklabels=[feature_names[i] for i in top_indices],
                   xticklabels=[f'Class {i}' for i in range(n_classes)])
        plt.title('Feature Importance by Class (Permutation Importance)', fontsize=16)
        plt.tight_layout()
        
        # Save the plot
        output_path = os.path.join(output_dir, 'feature_importance_by_class.png')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved feature importance by class heatmap to {output_path}")
    
    # Feature importance variation plot (alternative to SHAP summary)
    # This shows mean and standard deviation of feature importance
    result = permutation_importance(model, X_scaled, y_encoded, n_repeats=20, random_state=42)
    
    # Get top N features
    top_n = min(30, len(feature_names))
    perm_sorted_idx = result.importances_mean.argsort()[::-1]
    top_indices = perm_sorted_idx[:top_n]
    
    # Create plot showing mean and std of importance
    plt.figure(figsize=(12, 8))
    plt.bar(range(top_n), result.importances_mean[top_indices],
           yerr=result.importances_std[top_indices],
           align='center')
    plt.xticks(range(top_n), [feature_names[i] for i in top_indices], rotation=90)
    plt.title('Feature Importance with Variation (Alternative to SHAP)', fontsize=16)
    plt.tight_layout()
    
    # Save the plot
    output_path = os.path.join(output_dir, 'feature_importance_with_variation.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved feature importance variation plot to {output_path}")
    
    # Plot partial dependence plots for top features (another SHAP alternative)
    try:
        from sklearn.inspection import plot_partial_dependence
        
        # Use top 5 features for partial dependence
        top_5_indices = top_indices[:5]
        top_5_features = [feature_names[i] for i in top_5_indices]
        
        # Create figure
        fig, ax = plt.subplots(figsize=(12, 10))
        
        # Plot partial dependence
        display = plot_partial_dependence(
            model, X_scaled, features=top_5_indices, 
            feature_names=feature_names, n_jobs=2,
            ax=ax, random_state=42
        )
        
        plt.suptitle('Partial Dependence of Top Features (SHAP Alternative)', fontsize=16)
        plt.tight_layout()
        
        # Save the plot
        output_path = os.path.join(output_dir, 'partial_dependence.png')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved partial dependence plot to {output_path}")
    except Exception as e:
        print(f"Could not create partial dependence plot: {e}")

def plot_univariate_importance(X_scaled, y_encoded, feature_names, output_dir):
    """Generate univariate feature importance plots using F-test and Mutual Information"""
    # F-test
    f_values, p_values = f_classif(X_scaled, y_encoded)
    
    # Sort features by F-values
    f_indices = np.argsort(f_values)[::-1]
    
    # Select top 30 features for readability
    top_n = min(30, len(feature_names))
    top_f_indices = f_indices[:top_n]
    
    plt.figure(figsize=(12, 8))
    plt.title('Univariate Feature Importance (F-test)', fontsize=16)
    plt.bar(range(top_n), f_values[top_f_indices], align='center')
    plt.xticks(range(top_n), [feature_names[i] for i in top_f_indices], rotation=90)
    plt.xlim([-1, top_n])
    plt.tight_layout()
    
    # Save the F-test plot
    output_path_f = os.path.join(output_dir, 'f_test_importance.png')
    plt.savefig(output_path_f, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved F-test importance plot to {output_path_f}")
    
    # Mutual Information
    mi_values = mutual_info_classif(X_scaled, y_encoded, random_state=42)
    
    # Sort features by MI values
    mi_indices = np.argsort(mi_values)[::-1]
    
    # Select top 30 features
    top_mi_indices = mi_indices[:top_n]
    
    plt.figure(figsize=(12, 8))
    plt.title('Univariate Feature Importance (Mutual Information)', fontsize=16)
    plt.bar(range(top_n), mi_values[top_mi_indices], align='center')
    plt.xticks(range(top_n), [feature_names[i] for i in top_mi_indices], rotation=90)
    plt.xlim([-1, top_n])
    plt.tight_layout()
    
    # Save the MI plot
    output_path_mi = os.path.join(output_dir, 'mutual_info_importance.png')
    plt.savefig(output_path_mi, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved Mutual Information importance plot to {output_path_mi}")
    
    return f_values, mi_values

def plot_rfe_ranking(X_scaled, y_encoded, feature_names, output_dir):
    """Generate Recursive Feature Elimination (RFE) ranking plot"""
    # Create a classifier for RFE
    rf = RandomForestClassifier(n_estimators=50, random_state=42)
    
    # Use a subset of features if there are too many
    n_features_to_select = min(30, len(feature_names))
    
    # Run RFE
    rfe = RFE(estimator=rf, n_features_to_select=n_features_to_select, step=1)
    rfe.fit(X_scaled, y_encoded)
    
    # Get feature ranking (the smaller the number, the more important the feature)
    ranking = rfe.ranking_
    
    # Sort features by ranking
    ranking_indices = np.argsort(ranking)
    
    # Select top 30 features
    top_n = min(30, len(feature_names))
    top_ranking_indices = ranking_indices[:top_n]
    
    plt.figure(figsize=(12, 8))
    plt.title('Recursive Feature Elimination (RFE) Ranking', fontsize=16)
    plt.bar(range(top_n), [1/r if r > 0 else float('inf') for r in ranking[top_ranking_indices]], align='center')
    plt.xticks(range(top_n), [feature_names[i] for i in top_ranking_indices], rotation=90)
    plt.xlim([-1, top_n])
    plt.ylabel('1/Ranking (higher is more important)')
    plt.tight_layout()
    
    # Save the plot
    output_path = os.path.join(output_dir, 'rfe_ranking.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved RFE ranking plot to {output_path}")

def plot_lasso_coefficients(X_scaled, y_encoded, feature_names, output_dir):
    """Generate LASSO coefficient plot for sparse feature selection"""
    # For multiclass, use one-vs-rest approach
    n_classes = len(np.unique(y_encoded))
    
    # Create binary target variables for One-vs-Rest
    y_binary = np.zeros((len(y_encoded), n_classes))
    for i in range(n_classes):
        y_binary[:, i] = (y_encoded == i).astype(int)
    
    # Train LASSO model for each class
    alpha = 0.01  # Adjust if needed
    coefs = np.zeros((n_classes, X_scaled.shape[1]))
    
    for i in range(n_classes):
        # Train LASSO for this class
        lasso = Lasso(alpha=alpha, max_iter=10000)
        lasso.fit(X_scaled, y_binary[:, i])
        coefs[i, :] = lasso.coef_
    
    # Compute average absolute coefficient for each feature across classes
    avg_abs_coefs = np.mean(np.abs(coefs), axis=0)
    
    # Sort features by coefficient magnitude
    coef_indices = np.argsort(avg_abs_coefs)[::-1]
    
    # Select top features
    top_n = min(30, len(feature_names))
    top_coef_indices = coef_indices[:top_n]
    
    plt.figure(figsize=(12, 8))
    plt.title('LASSO Coefficient Magnitude (Averaged Across Classes)', fontsize=16)
    plt.bar(range(top_n), avg_abs_coefs[top_coef_indices], align='center')
    plt.xticks(range(top_n), [feature_names[i] for i in top_coef_indices], rotation=90)
    plt.xlim([-1, top_n])
    plt.tight_layout()
    
    # Save the plot
    output_path = os.path.join(output_dir, 'lasso_coefficients.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved LASSO coefficient plot to {output_path}")

def create_feature_importance_summary(feature_names, rf_importances, perm_importances, 
                                     f_values, mi_values, output_dir):
    """Create a summary CSV of feature importances from different methods"""
    # Create a DataFrame to hold importance values
    importance_df = pd.DataFrame({'Feature': feature_names,
                                  'RandomForest': rf_importances,
                                  'Permutation': perm_importances,
                                  'F_Test': f_values,
                                  'MutualInfo': mi_values})
    
    # Sort by average rank across methods
    # First, create rank columns (higher value = more important)
    for col in ['RandomForest', 'Permutation', 'F_Test', 'MutualInfo']:
        importance_df[f'{col}_Rank'] = importance_df[col].rank(ascending=False)
    
    # Calculate average rank
    rank_cols = [c for c in importance_df.columns if c.endswith('_Rank')]
    importance_df['AvgRank'] = importance_df[rank_cols].mean(axis=1)
    
    # Sort by average rank
    importance_df = importance_df.sort_values('AvgRank')
    
    # Save to CSV
    output_path = os.path.join(output_dir, 'feature_importance_summary.csv')
    importance_df.to_csv(output_path, index=False)
    print(f"Saved feature importance summary to {output_path}")

def main():
    """Main function to coordinate visualization tasks"""
    # Define input and output directories
    feature_file = "featureExtraction/all_features.csv"
    output_dir = "visualizations"
    
    # Create output directory
    create_dir_if_not_exists(output_dir)
    
    # Load feature data
    df = load_feature_data(feature_file)
    if df is None:
        return
    
    # Prepare data for modeling
    X, X_scaled, y, y_encoded, label_encoder, feature_names = prepare_data_for_modeling(df)
    
    # Plot correlation heatmap
    print("\nGenerating correlation heatmap...")
    plot_correlation_heatmap(X, feature_names, output_dir)
    
    # Plot Random Forest feature importance
    print("\nGenerating random forest feature importance...")
    rf_model = plot_random_forest_importance(X_scaled, y_encoded, feature_names, output_dir)
    
    # Plot PCA scatter plots
    print("\nGenerating PCA scatter plots...")
    plot_pca_scatter(X_scaled, y_encoded, label_encoder, output_dir)
    
    # Plot permutation feature importance
    print("\nGenerating permutation feature importance...")
    perm_importances = plot_permutation_importance(rf_model, X_scaled, y_encoded, feature_names, output_dir)
    
    # Plot feature importance alternatives to SHAP
    print("\nGenerating feature importance visualization (SHAP alternative)...")
    plot_feature_importance_heatmap(rf_model, X_scaled, y_encoded, feature_names, output_dir)
    
    # Plot univariate feature importance
    print("\nGenerating univariate feature importance plots...")
    f_values, mi_values = plot_univariate_importance(X_scaled, y_encoded, feature_names, output_dir)
    
    # Plot RFE ranking
    print("\nGenerating Recursive Feature Elimination (RFE) ranking plot...")
    plot_rfe_ranking(X_scaled, y_encoded, feature_names, output_dir)
    
    # Plot LASSO coefficients
    print("\nGenerating LASSO coefficient plot...")
    plot_lasso_coefficients(X_scaled, y_encoded, feature_names, output_dir)
    
    # Create feature importance summary
    print("\nCreating feature importance summary...")
    create_feature_importance_summary(feature_names, 
                                     rf_model.feature_importances_,
                                     perm_importances,
                                     f_values,
                                     mi_values,
                                     output_dir)
    
    print("\nAll visualizations completed successfully!")

if __name__ == "__main__":
    main()

Created directory: visualizations
Loaded data with 283 samples and 109 features

Generating correlation heatmap...
Saved correlation heatmap to visualizations\correlation_heatmap.png

Generating random forest feature importance...
Saved random forest importance plot to visualizations\random_forest_importance.png

Generating PCA scatter plots...
Saved 2D PCA scatter plot to visualizations\pca_2d_scatter.png
Saved 3D PCA scatter plot to visualizations\pca_3d_scatter.png

Generating permutation feature importance...
Saved permutation importance plot to visualizations\permutation_importance.png

Generating feature importance visualization (SHAP alternative)...
Saved feature importance variation plot to visualizations\feature_importance_with_variation.png
Could not create partial dependence plot: cannot import name 'plot_partial_dependence' from 'sklearn.inspection' (c:\ProgramData\anaconda3\Lib\site-packages\sklearn\inspection\__init__.py)

Generating univariate feature importance plots...

ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from mpl_toolkits.mplot3d import Axes3D
import warnings
warnings.filterwarnings('ignore')

# Set the style for the plots
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_theme(style="white")

# Load the data
def load_data(file_path='all_features.csv'):
    """Load the feature data from CSV file."""
    try:
        df = pd.read_csv(file_path)
        print(f"Successfully loaded data with {df.shape[0]} rows and {df.shape[1]} columns.")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

def preprocess_data(df):
    """Preprocess the data for analysis."""
    # Make a copy to avoid modifying the original DataFrame
    df_processed = df.copy()
    
    # Extract target variable from char_name column
    target = df_processed['char_name']
    
    # Remove non-feature columns
    features = df_processed.drop(['file_name', 'char_name'], axis=1)
    
    # Handle missing values
    features = features.fillna(features.mean())
    
    # Scale features
    scaler = StandardScaler()
    features_scaled = pd.DataFrame(
        scaler.fit_transform(features),
        columns=features.columns
    )
    
    print(f"Data preprocessed: {features_scaled.shape[1]} features available.")
    return features_scaled, target

def generate_correlation_heatmap(features, output_file='correlation_heatmap.png'):
    """Generate and save correlation heatmap."""
    plt.figure(figsize=(20, 16))
    
    # Calculate correlation matrix
    corr_matrix = features.corr()
    
    # Generate a mask for the upper triangle
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    
    # Generate heatmap
    sns.heatmap(
        corr_matrix, 
        mask=mask,
        cmap='coolwarm', 
        annot=False,
        linewidths=.5, 
        center=0,
        square=True,
        vmin=-1, 
        vmax=1
    )
    
    plt.title('Feature Correlation Heatmap', fontsize=18)
    plt.tight_layout()
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Correlation heatmap saved to {output_file}")

def random_forest_importance(features, target, output_file='random_forest_importance.png'):
    """Generate and save Random Forest Feature Importance plot."""
    # Train a Random Forest model
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(features, target)
    
    # Get feature importances
    importances = rf.feature_importances_
    indices = np.argsort(importances)[::-1]
    
    # Plot the top 30 features
    plt.figure(figsize=(12, 8))
    plt.bar(range(30), importances[indices][:30], align='center')
    plt.xticks(range(30), features.columns[indices][:30], rotation=90)
    plt.title('Random Forest Feature Importance', fontsize=16)
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.tight_layout()
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Random Forest Feature Importance plot saved to {output_file}")

def generate_pca_plots(features, target, output_file_2d='pca_2d_scatter.png', output_file_3d='pca_3d_scatter.png'):
    """Generate and save PCA 2D and 3D scatter plots."""
    # Prepare colormap for different classes
    unique_targets = target.unique()
    colors = plt.cm.tab10(np.linspace(0, 1, len(unique_targets)))
    
    # 2D PCA Plot
    pca_2d = PCA(n_components=2)
    pca_result_2d = pca_2d.fit_transform(features)
    
    # Create a DataFrame with PCA results
    pca_df_2d = pd.DataFrame(data={'PCA1': pca_result_2d[:, 0], 'PCA2': pca_result_2d[:, 1], 'Target': target})
    
    plt.figure(figsize=(12, 8))
    for i, target_class in enumerate(unique_targets):
        indices = pca_df_2d['Target'] == target_class
        plt.scatter(
            pca_df_2d.loc[indices, 'PCA1'], 
            pca_df_2d.loc[indices, 'PCA2'],
            c=[colors[i]],
            label=target_class,
            alpha=0.7
        )
    
    explained_var_2d = pca_2d.explained_variance_ratio_
    plt.title(f'PCA 2D Scatter Plot\nExplained Variance: {explained_var_2d[0]:.2%} (PC1), {explained_var_2d[1]:.2%} (PC2)', fontsize=16)
    plt.xlabel(f'Principal Component 1 ({explained_var_2d[0]:.2%})')
    plt.ylabel(f'Principal Component 2 ({explained_var_2d[1]:.2%})')
    plt.legend(title="Characters")
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(output_file_2d, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"PCA 2D scatter plot saved to {output_file_2d}")
    
    # 3D PCA Plot
    pca_3d = PCA(n_components=3)
    pca_result_3d = pca_3d.fit_transform(features)
    
    # Create a DataFrame with PCA results
    pca_df_3d = pd.DataFrame(
        data={'PCA1': pca_result_3d[:, 0], 
              'PCA2': pca_result_3d[:, 1], 
              'PCA3': pca_result_3d[:, 2], 
              'Target': target}
    )
    
    fig = plt.figure(figsize=(12, 10))
    ax = fig.add_subplot(111, projection='3d')
    
    for i, target_class in enumerate(unique_targets):
        indices = pca_df_3d['Target'] == target_class
        ax.scatter(
            pca_df_3d.loc[indices, 'PCA1'],
            pca_df_3d.loc[indices, 'PCA2'],
            pca_df_3d.loc[indices, 'PCA3'],
            c=[colors[i]],
            label=target_class,
            alpha=0.7
        )
    
    explained_var_3d = pca_3d.explained_variance_ratio_
    ax.set_title(f'PCA 3D Scatter Plot\nTotal Explained Variance: {sum(explained_var_3d):.2%}', fontsize=16)
    ax.set_xlabel(f'PC1 ({explained_var_3d[0]:.2%})')
    ax.set_ylabel(f'PC2 ({explained_var_3d[1]:.2%})')
    ax.set_zlabel(f'PC3 ({explained_var_3d[2]:.2%})')
    ax.legend(title="Characters")
    plt.tight_layout()
    plt.savefig(output_file_3d, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"PCA 3D scatter plot saved to {output_file_3d}")

def permutation_importance_plot(features, target, output_file='permutation_importance.png'):
    """Generate and save Permutation Feature Importance plot."""
    # Train a Random Forest model
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(features, target)
    
    # Calculate permutation importance
    perm_importance = permutation_importance(rf, features, target, n_repeats=10, random_state=42)
    
    # Sort features by importance
    sorted_idx = perm_importance.importances_mean.argsort()[::-1]
    
    # Plot the top 30 features
    plt.figure(figsize=(12, 8))
    plt.boxplot(
        perm_importance.importances[sorted_idx][:30].T,
        vert=False,
        labels=features.columns[sorted_idx][:30]
    )
    plt.title("Permutation Feature Importance", fontsize=16)
    plt.xlabel("Decrease in Accuracy")
    plt.tight_layout()
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Permutation Feature Importance plot saved to {output_file}")

def univariate_feature_importance(features, target, method='f_test', output_file='univariate_importance.png'):
    """Generate and save Univariate Feature Importance plot."""
    # Choose the scoring function
    if method == 'f_test':
        score_func = f_classif
        title = 'Univariate Feature Importance (F-test)'
    else:  # mutual_info
        score_func = mutual_info_classif
        title = 'Univariate Feature Importance (Mutual Information)'
    
    # Apply SelectKBest
    selector = SelectKBest(score_func=score_func, k='all')
    selector.fit(features, target)
    
    # Get scores and sort them
    scores = selector.scores_
    indices = np.argsort(scores)[::-1]
    
    # Plot the top 30 features
    plt.figure(figsize=(12, 8))
    plt.bar(range(30), scores[indices][:30], align='center')
    plt.xticks(range(30), features.columns[indices][:30], rotation=90)
    plt.title(title, fontsize=16)
    plt.xlabel('Features')
    plt.ylabel('Score')
    plt.tight_layout()
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Univariate Feature Importance plot saved to {output_file}")

def rfe_ranking_plot(features, target, output_file='rfe_ranking.png'):
    """Generate and save Recursive Feature Elimination Ranking plot."""
    # Initialize the RF classifier
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    
    # Create RFE object and rank features
    rfe = RFE(estimator=rf, n_features_to_select=10, step=1)
    rfe.fit(features, target)
    
    # Get feature ranking
    ranking = rfe.ranking_
    
    # Sort features by ranking
    indices = np.argsort(ranking)
    
    # Plot the top 30 features with lowest rank (most important)
    plt.figure(figsize=(12, 8))
    plt.barh(range(30), [ranking[i] for i in indices[:30]])
    plt.yticks(range(30), features.columns[indices][:30])
    plt.title('Recursive Feature Elimination Ranking', fontsize=16)
    plt.xlabel('Rank (lower is better)')
    plt.ylabel('Features')
    plt.gca().invert_xaxis()  # Invert x-axis to show best features on top
    plt.tight_layout()
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"RFE Ranking plot saved to {output_file}")

def lasso_coefficient_plot(features, target, output_file='lasso_coefficients.png'):
    """Generate and save LASSO Coefficient plot."""
    # Convert target to numerical values for regression
    target_encoder = {label: i for i, label in enumerate(target.unique())}
    y_numeric = target.map(target_encoder)
    
    # Fit LASSO model
    lasso = Lasso(alpha=0.01)
    lasso.fit(features, y_numeric)
    
    # Get coefficients and sort by absolute value
    coefs = lasso.coef_
    indices = np.argsort(np.abs(coefs))[::-1]
    
    # Plot the top 30 features with highest absolute coefficient
    plt.figure(figsize=(12, 8))
    plt.bar(range(30), coefs[indices][:30], align='center')
    plt.xticks(range(30), features.columns[indices][:30], rotation=90)
    plt.title('LASSO Coefficients (Alpha=0.01)', fontsize=16)
    plt.xlabel('Features')
    plt.ylabel('Coefficient Value')
    plt.axhline(y=0, color='k', linestyle='-', alpha=0.3)
    plt.tight_layout()
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"LASSO Coefficient plot saved to {output_file}")

def run_all_analyses(file_path='all_features.csv'):
    """Run all feature analysis and generate reports."""
    print("Starting comprehensive feature analysis...")
    
    # Load the data
    df = load_data(file_path)
    if df is None:
        return
    
    # Preprocess the data
    features, target = preprocess_data(df)
    
    # Generate all reports
    print("\nGenerating reports...\n")
    
    generate_correlation_heatmap(features)
    random_forest_importance(features, target)
    generate_pca_plots(features, target)
    permutation_importance_plot(features, target)
    univariate_feature_importance(features, target, method='f_test')
    univariate_feature_importance(features, target, method='mutual_info', 
                               output_file='mutual_info_importance.png')
    rfe_ranking_plot(features, target)
    lasso_coefficient_plot(features, target)
    
    print("\nAll analyses completed successfully!")

if __name__ == "__main__":
    run_all_analyses()

Starting comprehensive feature analysis...
Successfully loaded data with 283 rows and 109 columns.
Data preprocessed: 107 features available.

Generating reports...

Correlation heatmap saved to correlation_heatmap.png
Random Forest Feature Importance plot saved to random_forest_importance.png
PCA 2D scatter plot saved to pca_2d_scatter.png


KeyboardInterrupt: 