# Advanced Feature Selection for HAI-21.03 Dataset (NaN-Handling Version)

This notebook implements advanced feature selection techniques for the HAI-21.03 industrial control system security dataset with proper handling of NaN values.

In [1]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from tqdm import tqdm
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import Lasso
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Set plot style
plt.style.use('ggplot')
sns.set(style="darkgrid")

# Set random seeds for reproducibility
np.random.seed(42)

## 1. Global Variables Setup

In [2]:
# Set paths - FIXED PATHS to match the directory structure
OUTPUT_DIR = '../hai-security-dataset/processed'
FEATURE_DIR = '../hai-security-dataset/features'

# Create directories if they don't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(FEATURE_DIR, exist_ok=True)

# Print current working directory to verify path
print(f"Current working directory: {os.getcwd()}")
print(f"Output directory: {os.path.abspath(OUTPUT_DIR)}")
print(f"Feature directory: {os.path.abspath(FEATURE_DIR)}")

# List files in directories
print("\nFiles in output directory:")
if os.path.exists(OUTPUT_DIR):
    files = os.listdir(OUTPUT_DIR)
    for file in files:
        print(f"  {file}")
else:
    print(f"  Directory {OUTPUT_DIR} does not exist")

print("\nFiles in feature directory:")
if os.path.exists(FEATURE_DIR):
    files = os.listdir(FEATURE_DIR)
    if files:
        for file in files:
            print(f"  {file}")
    else:
        print("  No files found (directory is empty)")
else:
    print(f"  Directory {FEATURE_DIR} does not exist")

Current working directory: c:\Users\User\WebstormProjects\hai-dataset-analysis\exp2
Output directory: c:\Users\User\WebstormProjects\hai-dataset-analysis\hai-security-dataset\processed
Feature directory: c:\Users\User\WebstormProjects\hai-dataset-analysis\hai-security-dataset\features

Files in output directory:
  enhanced_graph_v2.pkl
  feature_info.pkl
  merged_train.csv
  pca.pkl
  scaler.pkl
  test1.parquet
  test1_processed.csv
  test1_processed_enhanced_v2.csv
  test2.parquet
  test2_processed.csv
  test3.parquet
  test3_processed.csv
  test4.parquet
  test4_processed.csv
  test5.parquet
  test5_processed.csv
  train1.parquet
  train1_processed_enhanced_v2.csv
  train2.parquet
  train3.parquet
  train_processed.csv

Files in feature directory:
  feature_scaler.pkl


## 2. Load Enhanced Data

In [3]:
# Look for processed files
train_files = [f for f in os.listdir(OUTPUT_DIR) if f.startswith('train') and f.endswith('_processed_enhanced_v2.csv')]
if not train_files:
    train_files = [f for f in os.listdir(OUTPUT_DIR) if f.startswith('train') and f.endswith('_processed.csv')]

test_files = [f for f in os.listdir(OUTPUT_DIR) if f.startswith('test') and f.endswith('_processed_enhanced_v2.csv')]
if not test_files:
    test_files = [f for f in os.listdir(OUTPUT_DIR) if f.startswith('test') and f.endswith('_processed.csv')]

print(f"Found {len(train_files)} train files and {len(test_files)} test files")

Found 1 train files and 1 test files


In [4]:
# Load the first train file
if train_files:
    print(f"Loading {train_files[0]}...")
    train_df = pd.read_csv(os.path.join(OUTPUT_DIR, train_files[0]))
    print(f"Loaded {train_files[0]}: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
else:
    raise FileNotFoundError("No processed training data found. Please run the feature engineering notebook first.")

Loading train1_processed_enhanced_v2.csv...
Loaded train1_processed_enhanced_v2.csv: 216001 rows, 822 columns


In [5]:
# Load test data
test_data = {}
if test_files:
    print(f"Loading test files...")
    for file in test_files[:1]:  # Load only the first test file to save memory
        file_path = os.path.join(OUTPUT_DIR, file)
        file_name = file.split('_')[0]  # Extract test file name (e.g., 'test1')
        df = pd.read_csv(file_path)
        test_data[file_name] = df
        print(f"Loaded {file_name}: {df.shape[0]} rows, {df.shape[1]} columns")
else:
    print("No processed test data found. Will only work with training data.")

Loading test files...
Loaded test1: 43201 rows, 788 columns


## 3. Feature Filtering and NaN Handling

In [6]:
def filter_features(df):
    """
    Filter out non-feature columns and handle problematic features.
    
    Args:
        df (pd.DataFrame): Input dataframe
        
    Returns:
        tuple: (filtered_df, feature_cols, target_col)
    """
    # Exclude non-numeric columns
    numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
    
    # Exclude time and attack columns from features
    exclude_cols = [col for col in df.columns if col.startswith('time')]
    target_col = 'attack'
    
    # Get feature columns
    feature_cols = [col for col in numeric_cols if col not in exclude_cols and col != target_col]
    
    # Check for constant or near-constant features
    variance = df[feature_cols].var()
    constant_features = variance[variance < 1e-10].index.tolist()
    if constant_features:
        print(f"Removing {len(constant_features)} constant or near-constant features")
        feature_cols = [col for col in feature_cols if col not in constant_features]
    
    # Check for features with too many NaN values
    nan_percentage = df[feature_cols].isna().mean()
    high_nan_features = nan_percentage[nan_percentage > 0.1].index.tolist()
    if high_nan_features:
        print(f"Removing {len(high_nan_features)} features with >10% NaN values")
        feature_cols = [col for col in feature_cols if col not in high_nan_features]
    
    # Check for features with infinite values
    inf_features = []
    for col in feature_cols:
        if np.isinf(df[col]).any():
            inf_features.append(col)
    
    if inf_features:
        print(f"Removing {len(inf_features)} features with infinite values")
        feature_cols = [col for col in feature_cols if col not in inf_features]
    
    print(f"Selected {len(feature_cols)} valid features out of {len(numeric_cols)} numeric columns")
    
    # Create a filtered dataframe with selected features and target
    filtered_df = df[feature_cols + [target_col]].copy()
    
    # Check for any remaining NaN values
    nan_count = filtered_df[feature_cols].isna().sum().sum()
    if nan_count > 0:
        print(f"Found {nan_count} remaining NaN values. Filling with median values.")
        # Fill remaining NaN values with median for each column
        for col in feature_cols:
            if filtered_df[col].isna().any():
                median_val = filtered_df[col].median()
                filtered_df[col] = filtered_df[col].fillna(median_val)
    
    # Verify no NaN values remain
    nan_count_after = filtered_df[feature_cols].isna().sum().sum()
    if nan_count_after > 0:
        print(f"Warning: {nan_count_after} NaN values still remain after filling!")
    else:
        print("All NaN values have been handled successfully.")
    
    return filtered_df, feature_cols, target_col

In [7]:
# Filter features and handle NaN values
filtered_train_df, feature_cols, target_col = filter_features(train_df)

# Extract features and target
X_train = filtered_train_df[feature_cols].values
y_train = filtered_train_df[target_col].values

print(f"Training data shape: X={X_train.shape}, y={y_train.shape}")

# Double-check for NaN values
nan_count = np.isnan(X_train).sum()
if nan_count > 0:
    print(f"Warning: Found {nan_count} NaN values in X_train. Using SimpleImputer to handle them.")
    # Use SimpleImputer to handle any remaining NaN values
    imputer = SimpleImputer(strategy='median')
    X_train = imputer.fit_transform(X_train)
    print(f"After imputation, NaN count: {np.isnan(X_train).sum()}")
else:
    print("No NaN values found in X_train.")

Removing 53 constant or near-constant features
Removing 143 features with >10% NaN values
Selected 624 valid features out of 821 numeric columns
Found 1170000 remaining NaN values. Filling with median values.
All NaN values have been handled successfully.
Training data shape: X=(216001, 624), y=(216001,)
No NaN values found in X_train.


## 4. Feature Scaling

In [8]:
# Apply RobustScaler (good for anomaly detection)
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Check for NaN or infinite values after scaling
nan_count = np.isnan(X_train_scaled).sum()
inf_count = np.isinf(X_train_scaled).sum()

if nan_count > 0 or inf_count > 0:
    print(f"Warning: Found {nan_count} NaN values and {inf_count} infinite values after scaling.")
    print("Replacing NaN and infinite values with 0...")
    X_train_scaled = np.nan_to_num(X_train_scaled, nan=0.0, posinf=0.0, neginf=0.0)
    print(f"After replacement, NaN count: {np.isnan(X_train_scaled).sum()}, Inf count: {np.isinf(X_train_scaled).sum()}")
else:
    print("No NaN or infinite values found after scaling.")

# Save the scaler
with open(os.path.join(FEATURE_DIR, 'feature_scaler.pkl'), 'wb') as f:
    pickle.dump(scaler, f)
print(f"Saved scaler to {os.path.join(FEATURE_DIR, 'feature_scaler.pkl')}")

No NaN or infinite values found after scaling.
Saved scaler to ../hai-security-dataset/features\feature_scaler.pkl


## 5. Filter-Based Feature Selection

In [9]:
def apply_filter_methods(X_train_scaled, y_train, feature_cols, k=50):
    """
    Apply filter-based feature selection methods with error handling.
    """
    # Verify no NaN values in input data
    if np.isnan(X_train_scaled).any() or np.isnan(y_train).any():
        print("Warning: NaN values detected in input data. Replacing with zeros.")
        X_train_scaled = np.nan_to_num(X_train_scaled, nan=0.0)
        y_train = np.nan_to_num(y_train, nan=0.0)
    
    results = {}
    
    # 1. Variance Threshold
    print("Applying Variance Threshold...")
    selector = VarianceThreshold(threshold=0.01)
    X_var = selector.fit_transform(X_train_scaled)
    var_support = selector.get_support()
    var_features = [feature_cols[i] for i in range(len(feature_cols)) if var_support[i]]
    results['variance_threshold'] = {
        'support': var_support,
        'features': var_features,
        'scores': selector.variances_,
        'data': X_var
    }
    print(f"  Selected {len(var_features)} features")
    
    # 2. ANOVA F-value
    print("Applying ANOVA F-value...")
    try:
        selector = SelectKBest(f_classif, k=min(k, X_train_scaled.shape[1]))
        X_anova = selector.fit_transform(X_train_scaled, y_train)
        anova_support = selector.get_support()
        anova_features = [feature_cols[i] for i in range(len(feature_cols)) if anova_support[i]]
        results['anova'] = {
            'support': anova_support,
            'features': anova_features,
            'scores': selector.scores_,
            'data': X_anova
        }
        print(f"  Selected {len(anova_features)} features")
    except Exception as e:
        print(f"  Error applying ANOVA F-value: {e}")
        print("  Using RandomForest feature importance as a fallback")
        estimator = RandomForestClassifier(n_estimators=100, random_state=42)
        estimator.fit(X_train_scaled, y_train)
        rf_importances = estimator.feature_importances_
        rf_indices = np.argsort(rf_importances)[::-1][:k]
        anova_features = [feature_cols[i] for i in rf_indices]
        anova_support = np.zeros(len(feature_cols), dtype=bool)
        anova_support[rf_indices] = True
        results['anova'] = {
            'support': anova_support,
            'features': anova_features,
            'scores': rf_importances,
            'data': X_train_scaled[:, rf_indices]
        }
        print(f"  Selected {len(anova_features)} features using RandomForest")
    
    # 3. Mutual Information
    print("Applying Mutual Information...")
    try:
        selector = SelectKBest(mutual_info_classif, k=min(k, X_train_scaled.shape[1]))
        X_mi = selector.fit_transform(X_train_scaled, y_train)
        mi_support = selector.get_support()
        mi_features = [feature_cols[i] for i in range(len(feature_cols)) if mi_support[i]]
        results['mutual_info'] = {
            'support': mi_support,
            'features': mi_features,
            'scores': selector.scores_,
            'data': X_mi
        }
        print(f"  Selected {len(mi_features)} features")
    except Exception as e:
        print(f"  Error applying Mutual Information: {e}")
        print("  Using GradientBoosting feature importance as a fallback")
        estimator = GradientBoostingClassifier(n_estimators=100, random_state=42)
        estimator.fit(X_train_scaled, y_train)
        gb_importances = estimator.feature_importances_
        gb_indices = np.argsort(gb_importances)[::-1][:k]
        mi_features = [feature_cols[i] for i in gb_indices]
        mi_support = np.zeros(len(feature_cols), dtype=bool)
        mi_support[gb_indices] = True
        results['mutual_info'] = {
            'support': mi_support,
            'features': mi_features,
            'scores': gb_importances,
            'data': X_train_scaled[:, gb_indices]
        }
        print(f"  Selected {len(mi_features)} features using GradientBoosting")
    
    return results

In [10]:
# Apply filter-based methods
filter_results = apply_filter_methods(X_train_scaled, y_train, feature_cols, k=50)

# Save filter results
with open(os.path.join(FEATURE_DIR, 'filter_results.pkl'), 'wb') as f:
    pickle.dump(filter_results, f)
print(f"Saved filter results to {os.path.join(FEATURE_DIR, 'filter_results.pkl')}")

Applying Variance Threshold...
  Selected 618 features
Applying ANOVA F-value...
  Selected 50 features
Applying Mutual Information...
  Selected 50 features
Saved filter results to ../hai-security-dataset/features\filter_results.pkl


## 6. Wrapper-Based Feature Selection

In [11]:
def apply_wrapper_methods(X_train_scaled, y_train, feature_cols):
    """
    Apply wrapper-based feature selection methods with error handling.
    """
    # Verify no NaN values in input data
    if np.isnan(X_train_scaled).any() or np.isnan(y_train).any():
        print("Warning: NaN values detected in input data. Replacing with zeros.")
        X_train_scaled = np.nan_to_num(X_train_scaled, nan=0.0)
        y_train = np.nan_to_num(y_train, nan=0.0)
    
    results = {}
    
    # 1. Recursive Feature Elimination (RFE)
    print("Applying Recursive Feature Elimination...")
    try:
        estimator = RandomForestClassifier(n_estimators=100, random_state=42)
        selector = RFE(estimator, n_features_to_select=min(50, X_train_scaled.shape[1]), step=0.1)
        X_rfe = selector.fit_transform(X_train_scaled, y_train)
        rfe_support = selector.get_support()
        rfe_features = [feature_cols[i] for i in range(len(feature_cols)) if rfe_support[i]]
        results['rfe'] = {
            'support': rfe_support,
            'features': rfe_features,
            'ranking': selector.ranking_,
            'data': X_rfe
        }
        print(f"  Selected {len(rfe_features)} features")
    except Exception as e:
        print(f"  Error applying RFE: {e}")
        print("  Using RandomForest feature importance directly")
        estimator = RandomForestClassifier(n_estimators=100, random_state=42)
        estimator.fit(X_train_scaled, y_train)
        rf_importances = estimator.feature_importances_
        rf_indices = np.argsort(rf_importances)[::-1][:50]
        rfe_features = [feature_cols[i] for i in rf_indices]
        rfe_support = np.zeros(len(feature_cols), dtype=bool)
        rfe_support[rf_indices] = True
        rfe_ranking = np.ones(len(feature_cols)) * 100  # High rank for non-selected features
        rfe_ranking[rf_indices] = np.arange(1, len(rf_indices) + 1)  # Rank selected features
        results['rfe'] = {
            'support': rfe_support,
            'features': rfe_features,
            'ranking': rfe_ranking,
            'data': X_train_scaled[:, rf_indices]
        }
        print(f"  Selected {len(rfe_features)} features using RandomForest importance")
    
    # 2. Random Forest Feature Importance
    print("Applying Random Forest Feature Importance...")
    estimator = RandomForestClassifier(n_estimators=100, random_state=42)
    estimator.fit(X_train_scaled, y_train)
    rf_importances = estimator.feature_importances_
    rf_indices = np.argsort(rf_importances)[::-1]
    rf_features = [feature_cols[i] for i in rf_indices[:50]]
    results['random_forest'] = {
        'features': rf_features,
        'importances': rf_importances,
        'indices': rf_indices
    }
    print(f"  Selected {len(rf_features)} features")
    
    # 3. Gradient Boosting Feature Importance
    print("Applying Gradient Boosting Feature Importance...")
    try:
        estimator = GradientBoostingClassifier(n_estimators=100, random_state=42)
        estimator.fit(X_train_scaled, y_train)
        gb_importances = estimator.feature_importances_
        gb_indices = np.argsort(gb_importances)[::-1]
        gb_features = [feature_cols[i] for i in gb_indices[:50]]
        results['gradient_boosting'] = {
            'features': gb_features,
            'importances': gb_importances,
            'indices': gb_indices
        }
        print(f"  Selected {len(gb_features)} features")
    except Exception as e:
        print(f"  Error applying Gradient Boosting: {e}")
        print("  Using Random Forest results as a fallback")
        results['gradient_boosting'] = results['random_forest']
        print(f"  Selected {len(rf_features)} features using Random Forest")
    
    return results

In [12]:
# Apply wrapper-based methods
wrapper_results = apply_wrapper_methods(X_train_scaled, y_train, feature_cols)

# Save wrapper results
with open(os.path.join(FEATURE_DIR, 'wrapper_results.pkl'), 'wb') as f:
    pickle.dump(wrapper_results, f)
print(f"Saved wrapper results to {os.path.join(FEATURE_DIR, 'wrapper_results.pkl')}")

Applying Recursive Feature Elimination...
  Selected 50 features
Applying Random Forest Feature Importance...
  Selected 50 features
Applying Gradient Boosting Feature Importance...
  Error applying Gradient Boosting: y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required.
  Using Random Forest results as a fallback
  Selected 50 features using Random Forest
Saved wrapper results to ../hai-security-dataset/features\wrapper_results.pkl


## 7. Embedded-Based Feature Selection

In [13]:
def apply_embedded_methods(X_train_scaled, y_train, feature_cols):
    """
    Apply embedded feature selection methods with error handling.
    """
    # Verify no NaN values in input data
    if np.isnan(X_train_scaled).any() or np.isnan(y_train).any():
        print("Warning: NaN values detected in input data. Replacing with zeros.")
        X_train_scaled = np.nan_to_num(X_train_scaled, nan=0.0)
        y_train = np.nan_to_num(y_train, nan=0.0)
    
    results = {}
    
    # 1. Lasso
    print("Applying Lasso...")
    try:
        lasso = Lasso(alpha=0.01)
        lasso.fit(X_train_scaled, y_train)
        lasso_coef = np.abs(lasso.coef_)
        lasso_indices = np.argsort(lasso_coef)[::-1]
        lasso_features = [feature_cols[i] for i in lasso_indices[:50]]
        results['lasso'] = {
            'features': lasso_features,
            'coefficients': lasso_coef,
            'indices': lasso_indices
        }
        print(f"  Selected {len(lasso_features)} features")
    except Exception as e:
        print(f"  Error applying Lasso: {e}")
        print("  Using Random Forest feature importance as a fallback")
        estimator = RandomForestClassifier(n_estimators=100, random_state=42)
        estimator.fit(X_train_scaled, y_train)
        rf_importances = estimator.feature_importances_
        rf_indices = np.argsort(rf_importances)[::-1][:50]
        lasso_features = [feature_cols[i] for i in rf_indices]
        results['lasso'] = {
            'features': lasso_features,
            'coefficients': rf_importances,
            'indices': rf_indices
        }
        print(f"  Selected {len(lasso_features)} features using Random Forest")
    
    # 2. SelectFromModel with Random Forest
    print("Applying SelectFromModel with Random Forest...")
    try:
        estimator = RandomForestClassifier(n_estimators=100, random_state=42)
        selector = SelectFromModel(estimator, threshold='median')
        X_sfm = selector.fit_transform(X_train_scaled, y_train)
        sfm_support = selector.get_support()
        sfm_features = [feature_cols[i] for i in range(len(feature_cols)) if sfm_support[i]]
        results['select_from_model'] = {
            'support': sfm_support,
            'features': sfm_features,
            'data': X_sfm
        }
        print(f"  Selected {len(sfm_features)} features")
    except Exception as e:
        print(f"  Error applying SelectFromModel: {e}")
        print("  Using top 50 features from Random Forest importance directly")
        estimator = RandomForestClassifier(n_estimators=100, random_state=42)
        estimator.fit(X_train_scaled, y_train)
        rf_importances = estimator.feature_importances_
        rf_indices = np.argsort(rf_importances)[::-1][:50]
        sfm_features = [feature_cols[i] for i in rf_indices]
        sfm_support = np.zeros(len(feature_cols), dtype=bool)
        sfm_support[rf_indices] = True
        results['select_from_model'] = {
            'support': sfm_support,
            'features': sfm_features,
            'data': X_train_scaled[:, rf_indices]
        }
        print(f"  Selected {len(sfm_features)} features using Random Forest importance")
    
    return results

In [14]:
# Apply embedded-based methods
embedded_results = apply_embedded_methods(X_train_scaled, y_train, feature_cols)

# Save embedded results
with open(os.path.join(FEATURE_DIR, 'embedded_results.pkl'), 'wb') as f:
    pickle.dump(embedded_results, f)
print(f"Saved embedded results to {os.path.join(FEATURE_DIR, 'embedded_results.pkl')}")

Applying Lasso...
  Selected 50 features
Applying SelectFromModel with Random Forest...
  Selected 624 features
Saved embedded results to ../hai-security-dataset/features\embedded_results.pkl


## 8. Feature Ranking and Ensemble Selection

In [15]:
def ensemble_feature_selection(filter_results, wrapper_results, embedded_results, feature_cols):
    """
    Combine results from different feature selection methods.
    """
    # Initialize feature scores
    feature_scores = {feature: 0 for feature in feature_cols}
    
    # Collect all selected features
    all_selected_features = []
    
    # Add filter-based features
    all_selected_features.extend(filter_results['anova']['features'])
    all_selected_features.extend(filter_results['mutual_info']['features'])
    
    # Add wrapper-based features
    all_selected_features.extend(wrapper_results['rfe']['features'])
    all_selected_features.extend(wrapper_results['random_forest']['features'])
    all_selected_features.extend(wrapper_results['gradient_boosting']['features'])
    
    # Add embedded-based features
    all_selected_features.extend(embedded_results['lasso']['features'])
    all_selected_features.extend(embedded_results['select_from_model']['features'])
    
    # Count occurrences of each feature
    for feature in all_selected_features:
        if feature in feature_scores:  # Ensure feature exists in the dictionary
            feature_scores[feature] += 1
    
    # Sort features by score
    sorted_features = sorted(feature_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Select top features
    top_n = 50
    ensemble_features = [feature for feature, score in sorted_features[:top_n]]
    
    return ensemble_features, feature_scores

In [16]:
# Apply ensemble feature selection
ensemble_features, feature_scores = ensemble_feature_selection(
    filter_results, wrapper_results, embedded_results, feature_cols
)

print(f"Selected {len(ensemble_features)} features using ensemble method")

# Save ensemble features
with open(os.path.join(FEATURE_DIR, 'ensemble_features.pkl'), 'wb') as f:
    pickle.dump({
        'ensemble_features': ensemble_features,
        'feature_scores': feature_scores
    }, f)
print(f"Saved ensemble features to {os.path.join(FEATURE_DIR, 'ensemble_features.pkl')}")

# Save feature names to text file
with open(os.path.join(FEATURE_DIR, 'selected_features.txt'), 'w') as f:
    for feature in ensemble_features:
        f.write(f"{feature}\n")
print(f"Saved selected feature names to {os.path.join(FEATURE_DIR, 'selected_features.txt')}")

Selected 50 features using ensemble method
Saved ensemble features to ../hai-security-dataset/features\ensemble_features.pkl
Saved selected feature names to ../hai-security-dataset/features\selected_features.txt


## 9. Dimensionality Reduction

In [17]:
def apply_dimensionality_reduction(X_train_scaled, ensemble_features, feature_cols):
    """
    Apply dimensionality reduction techniques with error handling.
    """
    # Get indices of ensemble features
    ensemble_indices = [feature_cols.index(feature) for feature in ensemble_features if feature in feature_cols]
    X_ensemble = X_train_scaled[:, ensemble_indices]
    
    # Check for NaN values
    if np.isnan(X_ensemble).any():
        print("Warning: NaN values detected in ensemble features. Replacing with zeros.")
        X_ensemble = np.nan_to_num(X_ensemble, nan=0.0)
    
    results = {}
    
    # 1. PCA
    print("Applying PCA...")
    try:
        n_components = min(20, X_ensemble.shape[1])
        pca = PCA(n_components=n_components)
        X_pca = pca.fit_transform(X_ensemble)
        explained_variance = np.sum(pca.explained_variance_ratio_)
        results['pca'] = {
            'data': X_pca,
            'transformer': pca,
            'explained_variance': explained_variance
        }
        print(f"  Reduced to {n_components} components with {explained_variance:.4f} explained variance")
    except Exception as e:
        print(f"  Error applying PCA: {e}")
        print("  Skipping PCA")
        results['pca'] = {
            'data': X_ensemble,
            'transformer': None,
            'explained_variance': 0.0
        }
    
    # 2. Kernel PCA
    print("Applying Kernel PCA...")
    try:
        kpca = KernelPCA(n_components=n_components, kernel='rbf')
        X_kpca = kpca.fit_transform(X_ensemble)
        results['kpca'] = {
            'data': X_kpca,
            'transformer': kpca
        }
        print(f"  Reduced to {n_components} components")
    except Exception as e:
        print(f"  Error applying Kernel PCA: {e}")
        print("  Skipping Kernel PCA")
        results['kpca'] = {
            'data': X_ensemble,
            'transformer': None
        }
    
    return results, ensemble_indices

In [18]:
# Apply dimensionality reduction
reduction_results, ensemble_indices = apply_dimensionality_reduction(
    X_train_scaled, ensemble_features, feature_cols
)

# Save dimensionality reduction results
with open(os.path.join(FEATURE_DIR, 'reduction_results.pkl'), 'wb') as f:
    pickle.dump({
        'pca': reduction_results['pca']['transformer'],
        'kpca': reduction_results['kpca']['transformer'],
        'ensemble_indices': ensemble_indices
    }, f)
print(f"Saved dimensionality reduction results to {os.path.join(FEATURE_DIR, 'reduction_results.pkl')}")

Applying PCA...
  Reduced to 20 components with 0.9992 explained variance
Applying Kernel PCA...
  Error applying Kernel PCA: Unable to allocate 348. GiB for an array with shape (216001, 216001) and data type float64
  Skipping Kernel PCA
Saved dimensionality reduction results to ../hai-security-dataset/features\reduction_results.pkl


## 10. Save Feature Selection Results

In [19]:
# Create a comprehensive feature selection results dictionary
feature_selection_results = {
    'ensemble_features': ensemble_features,
    'feature_scores': feature_scores,
    'ensemble_indices': ensemble_indices,
    'scaler': scaler,
    'pca': reduction_results['pca']['transformer'],
    'kpca': reduction_results['kpca']['transformer']
}

# Save comprehensive results
with open(os.path.join(FEATURE_DIR, 'feature_selection_results.pkl'), 'wb') as f:
    pickle.dump(feature_selection_results, f)

print(f"Saved comprehensive feature selection results to {os.path.join(FEATURE_DIR, 'feature_selection_results.pkl')}")

# List all files in the feature directory to confirm they were created
print("\nFiles in feature directory:")
if os.path.exists(FEATURE_DIR):
    files = os.listdir(FEATURE_DIR)
    if files:
        for file in files:
            print(f"  {file}")
    else:
        print("  No files found (directory is empty)")
else:
    print(f"  Directory {FEATURE_DIR} does not exist")

Saved comprehensive feature selection results to ../hai-security-dataset/features\feature_selection_results.pkl

Files in feature directory:
  embedded_results.pkl
  ensemble_features.pkl
  feature_scaler.pkl
  feature_selection_results.pkl
  filter_results.pkl
  reduction_results.pkl
  selected_features.txt
  wrapper_results.pkl
