Import Libraries

In [6]:
# Group 33, Florida Atlantic University
# Random Forest Feature Selection model
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

Load & Process Dataset

In [7]:
# Read the labelled miRNA table from its relative path and print its
# (rows, columns) shape as a quick sanity check.
labeled_miRNA_data = pd.read_csv(
    "../processed_data/miRNA_stage_subtype.csv"
)
print(labeled_miRNA_data.shape)

(1091, 1883)


Separate Features & Target

In [8]:
# Label columns sit at the end of the table: the second-to-last column is read
# as the stage label and the last column as the subtype label.
stage_labels = labeled_miRNA_data.iloc[:, -2]
subtype_labels = labeled_miRNA_data.iloc[:, -1]

# The general diagnosis is derived from the stage column (which is why it reads
# the same column as stage_labels): every value above 1 is collapsed to 1 and
# all other values are kept unchanged.
# 0 == negative, 1 == positive for lung cancer
pos_neg_labels = stage_labels.mask(stage_labels > 1, 1)

# Everything except the two trailing label columns is feature data.
miRNA_data = labeled_miRNA_data.iloc[:, :-2]

Scaling the Data

In [9]:
# Standardise every feature column; the fitted scaler stays available as
# `scaler`, and the transformed values are stored separately.
scaler = StandardScaler()
scaler.fit(miRNA_data)
miRNA_data_scaled_np = scaler.transform(miRNA_data)

Random Forest Feature Selection

In [12]:
# Generic Random Forest feature ranking (reused for the diagnosis, stage and subtype labels)
def performRFSelection(X, y, feature_names, disp_string, save_path, top_n=10):
    """
    Rank features by Random Forest importance and print the top ones.

    Fits a 1000-tree RandomForestClassifier (class_weight="balanced",
    random_state=42) on (X, y), pairs every entry of ``feature_importances_``
    with the matching name in ``feature_names``, sorts the pairs by importance
    in descending order and prints ``disp_string`` followed by the ``top_n``
    highest-ranked rows.

    Parameters:
        X (array): Feature data, one column per feature
        y (array): Labels, one per row of X
        feature_names (list): Feature names, in the same order as the columns of X
        disp_string (str): Heading printed above the ranking
        save_path (str): Intended CSV destination for the ranking. Currently
            unused: the export and plotting code at the end of the function is
            commented out, so nothing is written to disk.
        top_n (int): Number of top-ranked features to print (default 10)

    Returns:
        None

    Raises:
        ValueError: If top_n is less than 1, or if X is two-dimensional and
            len(feature_names) differs from its number of columns.
    """
    # Validate cheap preconditions first so a bad call fails immediately
    # instead of after the (expensive) 1000-tree fit.
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    shape = getattr(X, "shape", None)
    if shape is not None and len(shape) == 2 and shape[1] != len(feature_names):
        raise ValueError(
            f"feature_names has {len(feature_names)} entries "
            f"but X has {shape[1]} columns"
        )

    rf = RandomForestClassifier(n_estimators=1000, class_weight="balanced", random_state=42)
    rf.fit(X, y)

    # Determine Feature Importance
    importances = rf.feature_importances_
    importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
    importance_df = importance_df.sort_values(by="Importance", ascending=False)

    top_features = importance_df.head(top_n)
    print(disp_string)
    print(top_features)

    # Export and plotting are currently disabled; save_path is only consumed
    # by the commented-out lines below.
    # top_features.to_csv(save_path, index=False)

    # Plotting feature importances
    # plt.figure(figsize=(10, 6))
    # sns.barplot(data=top_features, x="Importance", y="Feature", palette="viridis")
    # plt.title(f"Top 10 Features by Random Forest: {disp_string}")
    # plt.tight_layout()
    # plt.savefig(save_path.replace(".csv", ".png"))
    # plt.show()

In [13]:
# Feature names, in the same order as the columns of the feature table.
miRNA_list = miRNA_data.columns.tolist()

In [14]:
# Diagnosis level: rank miRNAs against the binary positive/negative label
performRFSelection(
    X=miRNA_data_scaled_np,
    y=pos_neg_labels,
    feature_names=miRNA_list,
    disp_string="Top 10 features for general diagnosis of lung cancer",
    save_path="../results/random_forest/feature_selection_general.csv",
)

Top 10 features for general diagnosis of lung cancer
            Feature  Importance
317     hsa-mir-23c    0.015994
272     hsa-mir-202    0.015567
324   hsa-mir-26a-1    0.014352
162    hsa-mir-1304    0.012038
353     hsa-mir-30b    0.011801
325   hsa-mir-26a-2    0.011089
667     hsa-mir-411    0.010557
1341   hsa-mir-5690    0.010296
335     hsa-mir-299    0.009710
485   hsa-mir-329-2    0.008340


In [15]:
# Stage level: rank miRNAs against the stage label
performRFSelection(
    X=miRNA_data_scaled_np,
    y=stage_labels,
    feature_names=miRNA_list,
    disp_string="Top 10 features for stage prediction of lung cancer",
    save_path="../results/random_forest/feature_selection_stage.csv",
)

Top 10 features for stage prediction of lung cancer
             Feature  Importance
231      hsa-mir-185    0.003067
353      hsa-mir-30b    0.003040
324    hsa-mir-26a-1    0.002844
272      hsa-mir-202    0.002732
253      hsa-mir-195    0.002657
325    hsa-mir-26a-2    0.002609
508     hsa-mir-3610    0.002568
1127     hsa-mir-504    0.002392
162     hsa-mir-1304    0.002363
179   hsa-mir-135a-2    0.002353


In [16]:
# Subtype level: rank miRNAs against the subtype label
performRFSelection(
    X=miRNA_data_scaled_np,
    y=subtype_labels,
    feature_names=miRNA_list,
    disp_string="Top 10 features for subtype prediction of lung cancer",
    save_path="../results/random_forest/feature_selection_subtype.csv",
)

Top 10 features for subtype prediction of lung cancer
           Feature  Importance
1447   hsa-mir-615    0.009278
1077  hsa-mir-4792    0.007812
568   hsa-mir-3686    0.006658
240   hsa-mir-190b    0.006301
168    hsa-mir-132    0.006166
499    hsa-mir-34b    0.005997
584   hsa-mir-3692    0.005936
1554  hsa-mir-6738    0.005705
592   hsa-mir-374a    0.005658
1689  hsa-mir-6863    0.005571
