# Feature Selection on DOROTHEA

### Steps in the code
1) Importing the dataset and loading all relevant packages, grouped by their function in our code.
2) Exploratory data analysis, producing Figures 1 and 2 for our report.
3) Embedded methods and filters for Section 2 of our report.
4) Line chart to illustrate the results from the previous step.
5) Forward Stepwise Selection and Backward Stepwise Selection.
6) FSS / BSS plot (this plot is omitted from the report).
7) Advanced method 1: Mutual Information + Lasso + SVM.
8) Advanced method 2: Optimized Random Forest.

In [None]:
# 1) Preliminaries
# Import necessary libraries and packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
import time
import ast
import warnings
from tqdm import tqdm
from pympler.asizeof import asizeof
from brokenaxes import brokenaxes

# Data handling and splitting
from scipy.sparse import csr_matrix
from sklearn.model_selection import (
    StratifiedKFold, train_test_split, GridSearchCV, RandomizedSearchCV
)
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, leaves_list

# Feature selection
from sklearn.feature_selection import (
    RFE, SelectKBest, SelectFromModel, SequentialFeatureSelector,
    mutual_info_classif, VarianceThreshold, f_classif, chi2
)
from skrebate import ReliefF
from mrmr import mrmr_classif

# Model training and evaluation
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import balanced_accuracy_score

# Load the dataset from the repository (pandas infers gzip compression from the
# ".gz" extension). The rest of the notebook reads the target from the 'label'
# column and treats every column from position 1 onwards as a feature.
data_raw = pd.read_csv("data2.csv.gz")

In [None]:
# 2) Exploratory data analysis
# We plot two figures to represent the data. Additionally, we calculate a few
# metrics that describe the data, its size and its properties.

# Figure 1: Class imbalance with percentages inside bars
# value_counts() orders its result by frequency. Sorting by label value instead
# keeps the bars aligned with the fixed tick labels below (-1 first, then 1),
# independent of which class happens to be the majority.
label_counts = data_raw['label'].value_counts().sort_index()
total_count = label_counts.sum()

plt.figure(figsize=(6, 4))
bar_positions = [0.25, 0.75]
plt.bar(bar_positions, label_counts, color=['lightgrey', 'darkgrey'], edgecolor="black", width=0.4)
for pos, count in zip(bar_positions, label_counts):
    # Write the class share just below the top edge of its bar
    plt.text(pos, count - total_count * 0.05, f"{count / total_count:.1%}",
             ha='center', va='center', fontsize=10, color='black')
plt.xlabel("Compound Activity")
plt.ylabel("Number of Compounds")
plt.xticks(bar_positions, ["Inactive (-1)", "Active (1)"])
plt.xlim(0, 1)
plt.savefig("Plots/figure_1.png", format="png", dpi=300, bbox_inches="tight")
plt.show()


# Figure 2: Distribution of means of all features
plt.figure(figsize=(6, 4))
# Drop the target from the per-column means: it is not a feature, and with
# labels coded -1/1 its mean can lie far outside the feature means and stretch
# the range that the 500 histogram bins are spread over.
column_means = data_raw.mean().drop('label')
# Share of rows whose label equals 1, drawn as a reference line
proportion_ones = (data_raw['label'] == 1).mean()

plt.hist(column_means, bins=500, color='lightgrey', edgecolor='darkgrey')
plt.axvline(proportion_ones, color='black', linestyle='--', linewidth=1)
plt.text(proportion_ones - 0.01, plt.gca().get_ylim()[1] * 0.8,
         f"Proportion of positive class\nin target: {proportion_ones:.3f}",
         color='black', fontsize=10, ha='right', va='center')
plt.xlabel("Mean Value")
plt.ylabel("Frequency")
plt.xlim(0, 0.15)
plt.savefig("Plots/figure_2.png", format="png", dpi=300, bbox_inches="tight")
plt.show()

# Sparsity score: share of zero entries, i.e.
# 1 - (number of non-zero elements / total number of elements in matrix).
# The sparse representation is built once and reused for both metrics.
data_sparse = csr_matrix(data_raw)
n_rows, n_cols = data_sparse.shape
sparsity = 1 - data_sparse.nnz / (n_rows * n_cols)
# Format as a percentage directly; multiplying a rounded float by 100 can
# print floating-point noise such as 56.99999999999999.
print(f"Data has a sparsity score of: {sparsity:.1%}.")

# Size of dataset: comparison between sparse and dense representation
print(f"Dense representation is {round(asizeof(data_raw)/asizeof(data_sparse))} times larger in memory.")


In [None]:
# 3) Testing filters and embedded methods with 10-fold cross-validation and separate test set

# Defining values for number of features to select.
# Neighbouring sub-ranges share their end points (20, 100 and 500); np.unique
# removes those duplicates so no configuration is cross-validated twice.
values = np.unique(np.concatenate([
    np.arange(1, 21, 1),        # Steps of 1 from 1 to 20
    np.arange(20, 101, 20),     # Steps of 20 from 20 to 100
    np.arange(100, 501, 50),    # Steps of 50 from 100 to 500
    np.arange(500, 1001, 100)   # Steps of 100 from 500 to 1000
]))
values_list = list(values)

# Initialize results storage (one dict per evaluated configuration)
results = []

# Define hyperparameter ranges for each method.
# Commented-out entries are methods that are implemented in the search loop
# but currently switched off.
hyperparameters = {
    "VarianceThreshold": {"threshold": np.linspace(0.05, 0.2, 50)},
    "Correlation (f_classif)": {"k": values_list},
    #"Mutual Information": {"k": values_list},
    "Chi-Square": {"k": values_list},
    "RandomForest": {"threshold": np.union1d(np.linspace(1e-7, 1e-3, 50), np.logspace(-3, -2, 10, base=10))},
    "Lasso": {"C": sorted(set(np.arange(0.01, 0.3, 0.01)).union({0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.5, 3, 5, 10, 20, 50, 100, 1000}))},
    #"ReliefF": {"n_features_to_select": values_list},
    #"ElasticNet": {"C": sorted(set(np.arange(0.01, 0.3, 0.01)).union({0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.5, 3, 5, 10, 20, 50, 100, 1000}))},
    #"mRMR": {"k": values_list}
}

# Cross-validation setup
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=25)

# Features are all columns from position 1 onwards; the target is the 'label' column
X = data_raw.iloc[:, 1:].values
y = data_raw['label']

# Split into training and test (80/20, stratified on the label)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Keep the feature matrices sparse; labels become plain arrays so they can be
# indexed positionally with the CV fold indices.
X_train = csr_matrix(X_train)
X_test = csr_matrix(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Start hyperparameter search for each filter method.
# Every candidate value is scored with the CV splitter `cv` on the training
# split; the selector is fitted on each fold's training part only.
for method_name, params in hyperparameters.items():
    # Loop variable is `candidate_values` (not `values`) so the feature-count
    # grid defined earlier is not overwritten.
    for param, candidate_values in params.items():
        for value in tqdm(candidate_values, desc=f"{method_name} Progress ({param})"):
            try:
                start_time = time.time()
                balanced_accuracies = []  # Store CV accuracies

                # Perform cross-validation
                for train_idx, test_idx in cv.split(X_train, y_train):
                    # Split the data
                    X_train_fold, X_test_fold = X_train[train_idx], X_train[test_idx]
                    y_train_fold, y_test_fold = y_train[train_idx], y_train[test_idx]

                    # Initialize filter method with current hyperparameter
                    if method_name == "VarianceThreshold":
                        method = VarianceThreshold(threshold=value)
                        method.fit(X_train_fold.toarray())
                        selected_indices = np.where(method.get_support())[0]
                    elif method_name == "Correlation (f_classif)":
                        method = SelectKBest(score_func=f_classif, k=value)
                        method.fit(X_train_fold.toarray(), y_train_fold)
                        selected_indices = method.get_support(indices=True)
                    elif method_name == "Mutual Information":
                        method = SelectKBest(score_func=mutual_info_classif, k=value)
                        method.fit(X_train_fold.toarray(), y_train_fold)
                        selected_indices = method.get_support(indices=True)
                    elif method_name == "Chi-Square":
                        method = SelectKBest(score_func=chi2, k=value)
                        method.fit(X_train_fold.toarray(), y_train_fold)
                        selected_indices = method.get_support(indices=True)
                    elif method_name == "ReliefF":
                        # ReliefF is run on a Chi-Square pre-filtered matrix (top 1000 features)
                        chi2_filter = SelectKBest(score_func=chi2, k=1000)
                        X_chi2_train = chi2_filter.fit_transform(X_train_fold.toarray(), y_train_fold)
                        method = ReliefF(n_features_to_select=value)
                        method.fit(X_chi2_train, y_train_fold)
                        # top_features_ indexes the pre-filtered columns; translate it
                        # back to column positions of the full matrix, which is what
                        # gets sliced below.
                        chi2_columns = chi2_filter.get_support(indices=True)
                        selected_indices = chi2_columns[method.top_features_[:value]]
                    elif method_name == "RandomForest":
                        method = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold=value)
                        method.fit(X_train_fold.toarray(), y_train_fold)
                        selected_indices = method.get_support(indices=True)
                    elif method_name == "Lasso":
                        # Embedded method: the L1-penalised model is scored directly,
                        # so the downstream logistic regression is skipped.
                        lasso = LogisticRegression(penalty='l1', solver='saga', C=value, max_iter=10000, random_state=42)
                        lasso.fit(X_train_fold, y_train_fold)
                        y_pred = lasso.predict(X_test_fold)
                        balanced_accuracies.append(balanced_accuracy_score(y_test_fold, y_pred))
                        selected_indices = np.where(lasso.coef_[0] != 0)[0]
                        continue
                    elif method_name == "ElasticNet":
                        # Embedded method, scored directly like Lasso
                        elasticnet = LogisticRegression(penalty='elasticnet', solver='saga', C=value, max_iter=10000, random_state=42, l1_ratio=0.8)
                        elasticnet.fit(X_train_fold, y_train_fold)
                        y_pred = elasticnet.predict(X_test_fold)
                        balanced_accuracies.append(balanced_accuracy_score(y_test_fold, y_pred))
                        selected_indices = np.where(elasticnet.coef_[0] != 0)[0]
                        continue
                    elif method_name == "mRMR":
                        # The frame gets default integer column names, so the selected
                        # names can be used as positional column indices
                        X_train_dense = X_train_fold.toarray()
                        X_train_df = pd.DataFrame(X_train_dense)
                        selected_indices = mrmr_classif(X_train_df, pd.Series(y_train_fold), K=value)

                    # Filter features using the selected indices
                    X_train_filtered = X_train_fold[:, selected_indices]
                    X_test_filtered = X_test_fold[:, selected_indices]

                    # Train logistic regression on filtered features (except for Lasso and ElasticNet)
                    model = LogisticRegression(random_state=42, max_iter=10000)
                    model.fit(X_train_filtered.toarray(), y_train_fold)
                    y_pred = model.predict(X_test_filtered.toarray())
                    balanced_accuracies.append(balanced_accuracy_score(y_test_fold, y_pred))

                # Log results
                # NOTE(review): `selected_indices` is the selection of the last CV fold
                # only, so the logged feature count and feature set describe that fold,
                # not a fit on the full training split.
                mean_balanced_acc = np.mean(balanced_accuracies)
                num_features = len(selected_indices)
                runtime = time.time() - start_time

                results.append({
                    "Method": method_name,
                    "Parameter": param,
                    "Value": value,
                    "Balanced Accuracies (CV)": balanced_accuracies,
                    "Mean Balanced Accuracy": mean_balanced_acc,
                    "Number of Features Selected": num_features,
                    "Selected Features": selected_indices,
                    "Runtime (seconds)": runtime,
                })

                print(f"Mean Balanced Accuracy: {mean_balanced_acc:.4f}, Features: {num_features}")

            except Exception as e:
                # Best effort: a failing configuration (e.g. a threshold that leaves
                # no features) is reported and skipped so the sweep can continue.
                print(f"Error with {method_name}, {param}: {value} - {e}")

# Save results to a DataFrame and CSV
results_df = pd.DataFrame(results)
if "Selected Features" in results_df.columns:
    # Write plain lists: str() of a long NumPy array is abbreviated with "..."
    # and wrapped over several lines, which would drop indices from the CSV.
    results_df["Selected Features"] = results_df["Selected Features"].apply(
        lambda indices: np.asarray(indices).tolist()
    )
results_df.to_csv("combined_filter_methods_results.csv", index=False)

# Score the winning configuration of every method on the held-out test split.
# NOTE(review): the feature set is taken from the "Selected Features" entry
# stored with the winning CV result; no selector is refitted on the full
# training split in this step.
final_results = []

for method_name in hyperparameters:
    # All logged configurations that belong to this method
    candidates = [entry for entry in results if entry["Method"] == method_name]
    if not candidates:
        continue  # nothing was logged for this method

    # Winner = highest mean balanced accuracy across the CV folds
    winner = max(candidates, key=lambda entry: entry["Mean Balanced Accuracy"])
    winning_param = winner["Parameter"]
    winning_value = winner["Value"]
    winning_features = winner["Selected Features"]
    n_winning_features = len(winning_features)

    # Fit a plain logistic regression on the selected columns of the whole
    # training split, then predict the test split restricted to the same columns
    final_model = LogisticRegression(random_state=25, max_iter=10000)
    final_model.fit(X_train[:, winning_features].toarray(), y_train)
    test_predictions = final_model.predict(X_test[:, winning_features].toarray())
    test_score = balanced_accuracy_score(y_test, test_predictions)

    # Record the test result for this method
    final_results.append({
        "Method": method_name,
        "Best Parameter": winning_param,
        "Best Value": winning_value,
        "Balanced Accuracy on Test Set": test_score,
        "Number of Features Selected": n_winning_features
    })

    print(f"\n{method_name} - Test Set Results:")
    print(f"Best Parameter: {winning_param}")
    print(f"Best Value: {winning_value}")
    print(f"Balanced Accuracy on Test Set: {test_score:.4f}")
    print(f"Number of Features Selected: {n_winning_features}")

# Persist the per-method test results
final_results_df = pd.DataFrame(final_results)
final_results_df.to_csv("test_set_evaluation_results.csv", index=False)

print("\nFinal Test Set Evaluation Results Saved!")


In [None]:
# 4) Linechart for filter and embedded methods

import matplotlib.pyplot as plt
import pandas as pd
import ast

# Read back the cross-validation results written in step 3
results_df = pd.read_csv("combined_filter_methods_results.csv")

# The per-fold scores come back from the CSV as strings; parse them into lists
# and derive the mean and the (sample) standard deviation per configuration
fold_scores = results_df["Balanced Accuracies (CV)"].apply(ast.literal_eval)
results_df["Balanced Accuracies (CV)"] = fold_scores
results_df["Balanced Accuracy Std"] = fold_scores.apply(lambda scores: pd.Series(scores).std())
results_df["Mean Balanced Accuracy"] = fold_scores.apply(lambda scores: pd.Series(scores).mean())

# Method name of every row, used to pick out one line per method
methods = results_df["Method"]

# Two panels sharing the y-axis; the right panel is twice as wide as the left
f, (ax1, ax2) = plt.subplots(
    1, 2, sharey=True, facecolor='w',
    gridspec_kw={'width_ratios': (1, 2), 'wspace': 0.02},
    figsize=(16, 8),
)

# x-axis window (number of features) shown by each panel
ranges = [(0, 50), (51, 1000)]

# Each panel draws every method's full curve and is then cropped to its window
for ax, (xmin, xmax) in zip([ax1, ax2], ranges):
    for method in methods.unique():
        rows = results_df.loc[methods == method]
        n_features = rows["Number of Features Selected"]
        mean_acc = rows["Mean Balanced Accuracy"]
        acc_std = rows["Balanced Accuracy Std"]

        # Mean balanced accuracy against number of selected features
        line, = ax.plot(n_features, mean_acc, label=method, linewidth=2)

        # Mark the best configuration of this method with a +/- one std error bar
        best_row = mean_acc.idxmax()
        ax.errorbar(
            n_features.loc[best_row], mean_acc.loc[best_row], yerr=acc_std.loc[best_row],
            fmt='o', color=line.get_color(), label=None, capsize=5
        )

    ax.set_xlim(xmin, xmax)

# Hide the spines where the two panels meet so they read as one broken axis
ax1.spines['right'].set_visible(False)
ax2.spines['left'].set_visible(False)
ax2.tick_params(axis='y', length=0)

# Shared axis labels for the combined figure
f.text(0.5, 0.04, "Number of Features Selected", ha='center', fontsize=16)
f.text(0.07, 0.5, "Mean Balanced Accuracy", va='center', rotation='vertical', fontsize=16)

# Legend inside the right-hand panel, positioned via its lower-right corner
ax2.legend(
    title="Method",
    loc="lower right",
    bbox_to_anchor=(0.999, 0.65),
    fontsize=14,
    title_fontsize=16,
    frameon=True
)

plt.tight_layout()
plt.savefig("Plots/figure_3_2.png", format="png", dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# 5) Forward and backward selection
# We use forward and backward sequential selection on a pre-filtered subset of the data as the algorithm is otherwise not computationally feasible.

# Prepare training and test set (70/30, stratified on the label)
X = data_raw.iloc[:, 1:]
y = data_raw['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train_sparse = csr_matrix(X_train)
X_test_sparse = csr_matrix(X_test)

# Rows are collected in a list and turned into a DataFrame once at the end,
# instead of concatenating onto an initially empty frame in every iteration.
result_columns = ['Method', 'Balanced Accuracy', 'Number of Features Selected', 'Runtime (seconds)', 'Selected Features']
result_rows = []

# Just Chi2 filter (for comparison)
for i in tqdm(range(10, 100, 10)):
    start_time = time.time()

    # Apply Chi2 filter
    chi_selector = SelectKBest(score_func=chi2, k=i)
    X_train_chi_selected = chi_selector.fit_transform(X_train_sparse, y_train)
    X_test_chi_selected = chi_selector.transform(X_test_sparse)
    selected_features = chi_selector.get_support(indices=True)

    # Train and evaluate model
    model = LogisticRegression(random_state=42)
    model.fit(X_train_chi_selected, y_train)
    y_pred = model.predict(X_test_chi_selected)
    runtime = time.time() - start_time
    balanced_acc = balanced_accuracy_score(y_test, y_pred)

    # Store results
    result_rows.append({
        'Method': 'Chi2',
        'Balanced Accuracy': balanced_acc,
        'Number of Features Selected': len(selected_features),
        'Runtime (seconds)': runtime,
        'Selected Features': selected_features.tolist(),
    })

# Variance Threshold pre-filter. It does not depend on the selection direction,
# so it is fitted once and shared by FSS and BSS.
variance_selector = VarianceThreshold(threshold=0.09)
X_train_var_selected = variance_selector.fit_transform(X_train_sparse.toarray())
X_test_var_selected = variance_selector.transform(X_test_sparse)
# Original column position of every feature that survived the variance filter
variance_feature_indices = variance_selector.get_support(indices=True)

# Forward Stepwise Selection (FSS), then Backward Stepwise Selection (BSS)
for method_label, direction in (('FSS', 'forward'), ('BSS', 'backward')):
    for i in tqdm(range(10, 100, 10)):
        start_time = time.time()

        # Apply stepwise selection in the current direction
        model = LogisticRegression(random_state=42)
        stepwise_selector = SequentialFeatureSelector(
            model, n_features_to_select=i, direction=direction, scoring='balanced_accuracy', cv=5
        )
        stepwise_selector.fit(X_train_var_selected, y_train)
        # The selector's support refers to the variance-filtered columns; map it
        # back to original column positions so that the stored indices mean the
        # same thing as in the Chi2 rows.
        selected_features = variance_feature_indices[stepwise_selector.get_support()]

        # Transform data
        X_train_stepwise = stepwise_selector.transform(X_train_var_selected)
        X_test_stepwise = stepwise_selector.transform(X_test_var_selected)

        # Train and evaluate model
        model.fit(X_train_stepwise, y_train)
        y_pred = model.predict(X_test_stepwise)
        runtime = time.time() - start_time
        balanced_acc = balanced_accuracy_score(y_test, y_pred)

        # Store results
        result_rows.append({
            'Method': method_label,
            'Balanced Accuracy': balanced_acc,
            'Number of Features Selected': len(selected_features),
            'Runtime (seconds)': runtime,
            'Selected Features': selected_features.tolist(),
        })

# Save results to a CSV file
results_table = pd.DataFrame(result_rows, columns=result_columns)
results_table.to_csv("stepwise_selection_results.csv", index=False)

In [None]:
# 6) (Plotting results of FSS /BSS) omitted from report
# Reads the results written in step 5 and draws one accuracy curve per method.
df = pd.read_csv("stepwise_selection_results.csv")
method_column = 'Method'
accuracy_column = 'Balanced Accuracy'
features_column = 'Number of Features Selected'

fig, ax = plt.subplots(figsize=(10, 6))
for method_name, method_rows in df.groupby(method_column):
    # Plain line without point markers
    ax.plot(
        method_rows[features_column],
        method_rows[accuracy_column],
        marker='',
        label=method_name,
    )

ax.set_xlabel('Number of Features Selected')
ax.set_ylabel('Balanced Accuracy')
ax.set_title('Figure 4: Performance of Forward and Backward Stepwise Selection')
ax.legend(title='Method')
ax.grid(False)

plt.tight_layout()
plt.savefig("Plots/figure_4.png", format="png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# 7) Advanced method 1: MI + Lasso + SVC
# Pipeline: Mutual Information keeps the top-k features, LassoCV keeps those
# with a non-zero coefficient, and an SVM is trained on what remains. k, C and
# the kernel are tuned on a validation split; the test split is used only once.

# Prepare Data
X = csr_matrix(data_raw.iloc[:, 1:].values)
y = data_raw.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Split training set further into training and validation sets
X_train_, X_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

best_balanced_acc = 0
best_params = {}

# Optimize Number of Features for Mutual Information
for k in [50, 100, 200, 500]:
    # Mutual Information Feature Selection
    filter_selector = SelectKBest(score_func=mutual_info_classif, k=k)
    X_train_filtered = filter_selector.fit_transform(X_train_, y_train_)
    X_val_filtered = filter_selector.transform(X_val)

    # Lasso Feature Selection
    lasso = LassoCV(
        cv=5, random_state=42, max_iter=10000, tol=1e-4, n_alphas=100
    ).fit(X_train_filtered.toarray(), y_train_)
    selected_mask = lasso.coef_ != 0
    if not selected_mask.any():
        # All coefficients were shrunk to zero: there is nothing to train the
        # SVM on, so skip this k instead of failing inside SVC.fit.
        print(f"Lasso kept no features for k={k}; skipping this configuration.")
        continue
    # Densify once per k rather than once per SVM configuration
    X_train_lasso = X_train_filtered[:, selected_mask].toarray()
    X_val_lasso = X_val_filtered[:, selected_mask].toarray()

    # SVM Parameter Optimization
    param_grid = {"C": [0.01, 0.1, 1, 10], "kernel": ["linear", "rbf", "poly"]}
    for kernel in param_grid["kernel"]:
        for C in param_grid["C"]:
            svm = SVC(C=C, kernel=kernel, class_weight="balanced", random_state=42)
            svm.fit(X_train_lasso, y_train_)
            y_val_pred = svm.predict(X_val_lasso)
            balanced_acc = balanced_accuracy_score(y_val, y_val_pred)

            # Strict ">" keeps the first configuration that reaches the best score
            if balanced_acc > best_balanced_acc:
                best_balanced_acc = balanced_acc
                best_params = {"k": k, "C": C, "kernel": kernel}

# Log Results
print("Best Balanced Accuracy on Validation Set:", best_balanced_acc)
print("Best Parameters:", best_params)

if not best_params:
    # Fail with a clear message rather than a KeyError on best_params["k"]
    raise RuntimeError("No configuration could be evaluated on the validation set.")

#-------------------------------------------------------------------------------------

# Final Evaluation on Test Set
# The whole pipeline is refitted on the full training split (training +
# validation part) with the selected hyperparameters.
# Perform Mutual Information with best number of features
filter_selector = SelectKBest(score_func=mutual_info_classif, k=best_params["k"])
X_train_final_filtered = filter_selector.fit_transform(X_train, y_train)
X_test_final_filtered = filter_selector.transform(X_test)

# Perform Lasso with selected features
lasso = LassoCV(
    cv=5, random_state=42, max_iter=10000, tol=1e-4, n_alphas=100
).fit(X_train_final_filtered.toarray(), y_train)
selected_mask = lasso.coef_ != 0
if not selected_mask.any():
    raise RuntimeError("Lasso kept no features on the full training set; the final SVM cannot be trained.")
X_train_final_lasso = X_train_final_filtered[:, selected_mask]
X_test_final_lasso = X_test_final_filtered[:, selected_mask]

# Train the final SVM model with the best parameters
final_svm = SVC(
    C=best_params["C"], kernel=best_params["kernel"], class_weight="balanced", random_state=42
)
final_svm.fit(X_train_final_lasso.toarray(), y_train)

# Evaluate on the test set
y_test_pred = final_svm.predict(X_test_final_lasso.toarray())
balanced_acc_test = balanced_accuracy_score(y_test, y_test_pred)

print("\nFinal Test Set Evaluation:")
print(f"Balanced Accuracy on Test Set: {balanced_acc_test}")
print(f"Number of Selected Features After Lasso: {X_train_final_lasso.shape[1]}")


In [None]:
# 8) Advanced method 2: Optimized random forest
# Mutual Information keeps the top-k features and a class-weighted random
# forest is tuned on a validation split. The best configuration is refitted on
# the full training split, features with above-average importance are kept,
# and the forest is retrained on those before the test split is scored.

# Prepare Data
X = csr_matrix(data_raw.iloc[:, 1:].values)
y = data_raw.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Carve a validation split out of the training split
X_train_, X_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

best_balanced_acc = 0
best_params = {}

# Random forest settings to try; expanded in the same order as three nested
# loops over n_estimators, max_depth and min_samples_split
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
}
rf_candidates = [
    {"n_estimators": n_trees, "max_depth": depth, "min_samples_split": min_split}
    for n_trees in param_grid["n_estimators"]
    for depth in param_grid["max_depth"]
    for min_split in param_grid["min_samples_split"]
]

# Joint search over the number of MI features and the forest settings
for k in [50, 100, 200, 500]:
    # Mutual Information feature selection, fitted on the inner training part
    filter_selector = SelectKBest(score_func=mutual_info_classif, k=k)
    X_train_filtered = filter_selector.fit_transform(X_train_, y_train_)
    X_val_filtered = filter_selector.transform(X_val)

    for rf_settings in rf_candidates:
        rf_model = RandomForestClassifier(
            class_weight="balanced",
            random_state=42,
            **rf_settings,
        )
        rf_model.fit(X_train_filtered, y_train_)
        balanced_acc_rf = balanced_accuracy_score(y_val, rf_model.predict(X_val_filtered))

        # Strict ">" keeps the first configuration that reaches the best score
        if balanced_acc_rf > best_balanced_acc:
            best_balanced_acc = balanced_acc_rf
            best_params = {"k": k, **rf_settings}

# Log Results
print("Best Balanced Accuracy on Validation Set:", best_balanced_acc)
print("Best Parameters:", best_params)

# Refit the MI filter and the forest on the full training split
filter_selector = SelectKBest(score_func=mutual_info_classif, k=best_params["k"])
X_train_final_filtered = filter_selector.fit_transform(X_train, y_train)
X_test_final_filtered = filter_selector.transform(X_test)

final_rf_model = RandomForestClassifier(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    class_weight="balanced",
    random_state=42,
)
final_rf_model.fit(X_train_final_filtered, y_train)

# Keep only features whose importance exceeds the mean importance
feature_importances = final_rf_model.feature_importances_
importance_threshold = np.mean(feature_importances)
selected_features_mask = feature_importances > importance_threshold

# Restrict training and test data to the retained features
X_train_selected = X_train_final_filtered[:, selected_features_mask]
X_test_selected = X_test_final_filtered[:, selected_features_mask]

print(f"Number of Selected Features After Thresholding: {X_train_selected.shape[1]}")

# Retrain on the reduced feature set and score the test split
final_rf_model.fit(X_train_selected, y_train)
y_test_pred = final_rf_model.predict(X_test_selected)
balanced_acc_test = balanced_accuracy_score(y_test, y_test_pred)
print("Final Balanced Accuracy on Test Set:", balanced_acc_test)

def _add_figure_frame(figure):
    """Draw a thin black rectangle around the whole canvas of *figure*."""
    frame = patches.Rectangle(
        (0, 0), 1, 1,
        linewidth=1, edgecolor="black", facecolor="none",
        transform=figure.transFigure, figure=figure
    )
    figure.patches.append(frame)


# Heatmap of the features kept by the random forest.
# Columns are ordered from most to least important, using the importances that
# were computed before the forest was retrained on the reduced feature set.
sorted_indices = np.argsort(feature_importances[selected_features_mask])[::-1]
features_dense = X_train_selected.toarray()
sorted_features = features_dense[:, sorted_indices]

# Training labels go in front of the sorted feature columns
heatmap_data = pd.DataFrame(sorted_features)
heatmap_data.insert(0, "label", y_train)

# Narrow panel for the label column, wide panel for the features
fig, (ax1, ax2) = plt.subplots(1, 2, gridspec_kw={'width_ratios': [0.15, 0.85]}, figsize=(8, 8))
sns.heatmap(heatmap_data[["label"]], cmap='Greys', cbar=False, ax=ax1)
ax1.axis("off")
sns.heatmap(heatmap_data.iloc[:, 1:], cmap='Greys', cbar=False, ax=ax2)
ax2.axis("off")

# Captions below the two panels
fig.text(0.09, -0.03, "Label", fontsize=14, ha="center")
fig.text(0.55, -0.03, "Features selected by Random Forest", fontsize=14, ha="center")

_add_figure_frame(fig)

plt.tight_layout()
plt.savefig("Plots/heatmap_sorted_features.png", format="png", dpi=300, bbox_inches="tight")
plt.show()

# Heatmap of the raw data for comparison: feature columns at positions 1-130
# of the original frame (column 0 is skipped)
rest_cols = data_raw.iloc[:, 1:131]
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(rest_cols, cmap='Greys', cbar=False, ax=ax)
ax.axis("off")
fig.text(0.5, -0.03, "Original dataset", fontsize=14, ha="center")

_add_figure_frame(fig)

plt.tight_layout()
plt.savefig("Plots/figure_5.png", format="png", dpi=300, bbox_inches="tight")
plt.show()