In [None]:
# Group 33, Florida Atlantic University
# Recursive Feature Elimination model
# 1/14/25

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Extracting Data 
labeled_miRNA_data = pd.read_csv('../processed_data/miRNA_stage_subtype.csv')

In [None]:
labeled_miRNA_data.head(5)

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b,stage,subtype
0,7314.747386,7391.483138,7334.393081,10994.201497,471.496698,318.193106,1156.241547,3272.099771,3363.611772,442.783758,...,0.0,0.0,1.847031,0,40.298863,35.429417,148.602058,12118.707689,1,2
1,9518.042994,9460.443528,9574.874468,17578.281899,785.810318,358.652676,771.986446,3871.452122,3917.224498,487.829079,...,0.0,128.562009,4.607957,0,8.60152,38.86044,111.512567,7471.802757,1,2
2,4479.97634,4387.407628,4447.955716,12394.31011,404.624244,855.241747,246.267705,1353.016896,1415.311564,416.8503,...,0.0,161.267504,1.746579,0,33.767203,31.43843,168.253822,16026.613214,1,2
3,21277.962603,21166.590502,21255.800397,15161.474118,6684.570363,503.278464,2185.922959,15012.229891,14987.262342,1107.549261,...,0.0,1.683206,10.660302,0,5.049617,95.101114,1416.978551,12750.562682,1,2
4,8002.355461,8013.396682,8033.638922,19358.942067,1276.411235,765.754731,593.005616,2630.801098,2649.43316,367.580673,...,0.0,97.990843,3.450382,0,22.77252,46.235116,455.450396,14401.203493,1,2


In [None]:
print(labeled_miRNA_data.shape)

(1091, 1883)


# Preprocessing

In [None]:
# Separating data from labels
pos_neg_labels = labeled_miRNA_data.iloc[:, -2]  # Binary cancer diagnosis (0 or 1)
stage_labels = labeled_miRNA_data.iloc[:, -3]  # Stage classification
subtype_labels = labeled_miRNA_data.iloc[:, -1]  # Subtype classification

# Condensing stage information into a general diagnosis (0 = healthy, 1 = cancer)
pos_neg_labels = pos_neg_labels.apply(lambda x: 1 if x > 1 else x)

# Extract only miRNA feature data
miRNA_data = labeled_miRNA_data.iloc[:, :-3]

In [None]:
# Logistic Regression responds better to scaled data
scaler = StandardScaler()
miRNA_data_scaled_np = scaler.fit_transform(miRNA_data)

# Convert back to DataFrame
miRNA_data_scaled = pd.DataFrame(miRNA_data_scaled_np, columns=miRNA_data.columns)

# Confirm transformations
print(f"miRNA data shape: {miRNA_data_scaled.shape}")
print(f"Stage label distribution:\n{stage_labels.value_counts()}")
print(f"Subtype label distribution:\n{subtype_labels.value_counts()}")

miRNA data shape: (1091, 1880)
Stage label distribution:
hsa-mir-99b
23146.607246    1
12118.707689    1
7471.802757     1
27175.592549    1
23838.524323    1
               ..
19091.961442    1
39609.280134    1
19807.131075    1
14401.203493    1
12750.562682    1
Name: count, Length: 1091, dtype: int64
Subtype label distribution:
subtype
1    560
2    519
0     11
4      1
Name: count, dtype: int64


# Diagnosis level feature selection
### i.e negative vs posistive cases

In [None]:
# Define Logistic Regression model for RFE
estimator_pos_neg = LogisticRegression(solver='lbfgs', max_iter=2000, class_weight='balanced')

# Perform RFE with more flexibility
selector_pos_neg = RFE(estimator_pos_neg, n_features_to_select=50, step=5)
selector_pos_neg = selector_pos_neg.fit(miRNA_data_scaled, pos_neg_labels)  # Use DataFrame, not NumPy array

In [None]:
# print(len(selector_pos_neg.support_))
print(f"Number of selected features: {sum(selector_pos_neg.support_)}")

Number of selected features: 50


In [None]:
def displayRankedFeatures(selector, disp_string, save_path):
    top_features_mask = selector.support_
    top_features_indices = [i for i, selected in enumerate(top_features_mask) if selected]

    print(disp_string)
    
    selected_features = [miRNA_list[idx] for idx in top_features_indices]  # Get actual feature names
    print("\n".join(selected_features))

    # Save actual feature names instead of boolean values
    feature_dataframe = pd.DataFrame({'Feature': selected_features})
    feature_dataframe.to_csv(save_path, index=False)
    print(f"Saved RFE results to: {save_path}")

In [None]:
displayRankedFeatures(selector_pos_neg, 
                      "Top selected features for general diagnosis of lung cancer", 
                      "../results/recursive_feature_elimination/recursive_feature_selection_general.csv")

Top selected features for general diagnosis of lung cancer
hsa-mir-1275
hsa-mir-1294
hsa-mir-135a-1
hsa-mir-181b-1
hsa-mir-186
hsa-mir-193b
hsa-mir-195
hsa-mir-19a
hsa-mir-218-1
hsa-mir-2355
hsa-mir-3125
hsa-mir-3175
hsa-mir-3193
hsa-mir-3199-1
hsa-mir-3202-1
hsa-mir-3654
hsa-mir-3655
hsa-mir-3688-1
hsa-mir-374b
hsa-mir-378d-2
hsa-mir-3907
hsa-mir-425
hsa-mir-4284
hsa-mir-4434
hsa-mir-4435-2
hsa-mir-4487
hsa-mir-4647
hsa-mir-4661
hsa-mir-4678
hsa-mir-4733
hsa-mir-4747
hsa-mir-501
hsa-mir-5094
hsa-mir-5571
hsa-mir-5690
hsa-mir-571
hsa-mir-597
hsa-mir-6507
hsa-mir-6780b
hsa-mir-6808
hsa-mir-6811
hsa-mir-6868
hsa-mir-6872
hsa-mir-6888
hsa-mir-7111
hsa-mir-7151
hsa-mir-7843
hsa-mir-7849
hsa-mir-8085
hsa-mir-938
Saved RFE results to: ../results/recursive_feature_elimination/recursive_feature_selection_general.csv


# Stage level feature selection

In [22]:
from sklearn.preprocessing import LabelEncoder

# Convert stage labels to categorical integers
le_stage = LabelEncoder()
stage_labels_encoded = le_stage.fit_transform(stage_labels)

# Confirm unique classes
print("Encoded stage labels:", np.unique(stage_labels_encoded))

# Define Logistic Regression model for RFE (stage classification)
estimator_stage = LogisticRegression(solver='lbfgs', max_iter=2000, class_weight='balanced')

# Perform RFE with more flexibility
selector_stage = RFE(estimator_stage, n_features_to_select=50, step=5)  # Increased features
selector_stage = selector_stage.fit(miRNA_data_scaled, stage_labels_encoded)  # Use encoded labels

Encoded stage labels: [   0    1    2 ... 1088 1089 1090]


In [24]:
# Display & Save the selected features
displayRankedFeatures(selector_stage, 
                      "Top selected features for stage prediction of lung cancer", 
                      "../results/recursive_feature_elimination/recursive_feature_selection_stage.csv")

AttributeError: 'RFE' object has no attribute 'support_'

# Subtype level feature selection

In [13]:
estimator_subtype = LogisticRegression(solver='lbfgs', max_iter=2000, class_weight='balanced')
selector_subtype = RFE(estimator_subtype, n_features_to_select=10,step=10)
selector_subtype = selector_subtype.fit(miRNA_data_scaled_np, subtype_labels)

In [14]:
displayRankedFeatures(selector_subtype, "Top 10 ranked features for subtype prediction of lung cancer", "../results/recursive_feature_elimination/recursive_feature_selection_subtype.csv")

Top 10 ranked features for subtype prediction of lung cancer
hsa-mir-1303
hsa-mir-3688-1
hsa-mir-378d-2
hsa-mir-378i
hsa-mir-4487
hsa-mir-4647
hsa-mir-4678
hsa-mir-5690
hsa-mir-5696
hsa-mir-6764


# Selecting features specific for each stage

In [29]:
# Print column names to check what's available
print("Column names in dataset:", labeled_miRNA_data.columns)

# Define columns to drop (only if they exist in the dataset)
columns_to_drop = ['stage', 'subtype']
if 'general' in labeled_miRNA_data.columns:
    columns_to_drop.append('general')

# Segment the data based on stage 0 - 4
stage_0_df = labeled_miRNA_data[labeled_miRNA_data["stage"] == 0]  # healthy patients
stage_1_df = labeled_miRNA_data[labeled_miRNA_data["stage"] == 1]
stage_2_df = labeled_miRNA_data[labeled_miRNA_data["stage"] == 2]
stage_3_df = labeled_miRNA_data[labeled_miRNA_data["stage"] == 3]
stage_4_df = labeled_miRNA_data[labeled_miRNA_data["stage"] == 4]

# Drop only columns that exist in the dataset
stage_0_data, stage_0_labels = stage_0_df.drop(columns=[col for col in columns_to_drop if col in stage_0_df.columns]), stage_0_df["stage"]
stage_1_data, stage_1_labels = stage_1_df.drop(columns=[col for col in columns_to_drop if col in stage_1_df.columns]), stage_1_df["stage"]
stage_2_data, stage_2_labels = stage_2_df.drop(columns=[col for col in columns_to_drop if col in stage_2_df.columns]), stage_2_df["stage"]
stage_3_data, stage_3_labels = stage_3_df.drop(columns=[col for col in columns_to_drop if col in stage_3_df.columns]), stage_3_df["stage"]
stage_4_data, stage_4_labels = stage_4_df.drop(columns=[col for col in columns_to_drop if col in stage_4_df.columns]), stage_4_df["stage"]

# Confirm fix
print("Fixed segmentation! Data shape per stage:")
print(f"Stage 0: {stage_0_data.shape}, Stage 1: {stage_1_data.shape}, Stage 2: {stage_2_data.shape}, Stage 3: {stage_3_data.shape}, Stage 4: {stage_4_data.shape}")

Column names in dataset: Index(['hsa-let-7a-1', 'hsa-let-7a-2', 'hsa-let-7a-3', 'hsa-let-7b',
       'hsa-let-7c', 'hsa-let-7d', 'hsa-let-7e', 'hsa-let-7f-1',
       'hsa-let-7f-2', 'hsa-let-7g',
       ...
       'hsa-mir-943', 'hsa-mir-944', 'hsa-mir-95', 'hsa-mir-9500',
       'hsa-mir-96', 'hsa-mir-98', 'hsa-mir-99a', 'hsa-mir-99b', 'stage',
       'subtype'],
      dtype='object', length=1883)
Fixed segmentation! Data shape per stage:
Stage 0: (11, 1881), Stage 1: (566, 1881), Stage 2: (308, 1881), Stage 3: (175, 1881), Stage 4: (31, 1881)


In [None]:
# Print column names to check what's available
print("Column names in dataset:", labeled_miRNA_data.columns)

# Define columns to drop (only if they exist in the dataset)
columns_to_drop = ['stage', 'subtype']
if 'general' in labeled_miRNA_data.columns:
    columns_to_drop.append('general')

# Segmenting the data based on subtype 0 - 5
subtype_0_df = labeled_miRNA_data[labeled_miRNA_data["subtype"] == 0]  # Subtype 0 patients
subtype_1_df = labeled_miRNA_data[labeled_miRNA_data["subtype"] == 1]
subtype_2_df = labeled_miRNA_data[labeled_miRNA_data["subtype"] == 2]
subtype_3_df = labeled_miRNA_data[labeled_miRNA_data["subtype"] == 3]
subtype_4_df = labeled_miRNA_data[labeled_miRNA_data["subtype"] == 4]
subtype_5_df = labeled_miRNA_data[labeled_miRNA_data["subtype"] == 5]

# Drop only label columns that exist
subtype_0_data, subtype_0_labels = subtype_0_df.drop(columns=[col for col in columns_to_drop if col in subtype_0_df.columns]), subtype_0_df["subtype"]
subtype_1_data, subtype_1_labels = subtype_1_df.drop(columns=[col for col in columns_to_drop if col in subtype_1_df.columns]), subtype_1_df["subtype"]
subtype_2_data, subtype_2_labels = subtype_2_df.drop(columns=[col for col in columns_to_drop if col in subtype_2_df.columns]), subtype_2_df["subtype"]
subtype_3_data, subtype_3_labels = subtype_3_df.drop(columns=[col for col in columns_to_drop if col in subtype_3_df.columns]), subtype_3_df["subtype"]
subtype_4_data, subtype_4_labels = subtype_4_df.drop(columns=[col for col in columns_to_drop if col in subtype_4_df.columns]), subtype_4_df["subtype"]
subtype_5_data, subtype_5_labels = subtype_5_df.drop(columns=[col for col in columns_to_drop if col in subtype_5_df.columns]), subtype_5_df["subtype"]

# Confirm fix
print("Fixed segmentation! Data shape per subtype:")
print(f"Subtype 0: {subtype_0_data.shape}, Subtype 1: {subtype_1_data.shape}, Subtype 2: {subtype_2_data.shape}, Subtype 3: {subtype_3_data.shape}, Subtype 4: {subtype_4_data.shape}, Subtype 5: {subtype_5_data.shape}")

Column names in dataset: Index(['hsa-let-7a-1', 'hsa-let-7a-2', 'hsa-let-7a-3', 'hsa-let-7b',
       'hsa-let-7c', 'hsa-let-7d', 'hsa-let-7e', 'hsa-let-7f-1',
       'hsa-let-7f-2', 'hsa-let-7g',
       ...
       'hsa-mir-943', 'hsa-mir-944', 'hsa-mir-95', 'hsa-mir-9500',
       'hsa-mir-96', 'hsa-mir-98', 'hsa-mir-99a', 'hsa-mir-99b', 'stage',
       'subtype'],
      dtype='object', length=1883)
Fixed segmentation! Data shape per subtype:
Subtype 0: (11, 1881), Subtype 1: (560, 1881), Subtype 2: (519, 1881), Subtype 3: (0, 1881), Subtype 4: (1, 1881), Subtype 5: (0, 1881)


In [44]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

def parameterSpecificSelection(healthy_data, unhealthy_data, healthy_labels, unhealthy_labels, parameter_name, save_path):
    """
    Runs Recursive Feature Elimination (RFE) to select top features for a given classification task.
    
    Args:
        healthy_data (pd.DataFrame): Features for healthy cases
        unhealthy_data (pd.DataFrame): Features for diseased cases
        healthy_labels (pd.Series): Labels for healthy cases
        unhealthy_labels (pd.Series): Labels for diseased cases
        parameter_name (str): Name of the classification task (e.g., "Stage" or "Subtype")
        save_path (str): File path to save selected features
    """

    # Step 1: Validate Data
    if healthy_data.empty or unhealthy_data.empty:
        print(f"❌ ERROR: One of the datasets is empty! Skipping {parameter_name} analysis.")
        return

    if healthy_labels.empty or unhealthy_labels.empty:
        print(f"❌ ERROR: One of the label sets is empty! Skipping {parameter_name} analysis.")
        return

    # Step 2: Combine Data
    X = pd.concat([healthy_data, unhealthy_data])
    Y = pd.concat([healthy_labels, unhealthy_labels])

    # Step 3: Scale Features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Step 4: Encode Labels
    Y_encoded = LabelEncoder().fit_transform(Y)

    # Step 5: Perform Recursive Feature Elimination (RFE)
    estimator = LogisticRegression(solver='lbfgs', max_iter=2000, class_weight='balanced')
    selector = RFE(estimator, n_features_to_select=20, step=5)  # Adjust `n_features_to_select` as needed
    selector.fit(X_scaled, Y_encoded)

    # Step 6: Save Selected Features
    displayRankedFeatures(selector, f'Top 20 ranked features for {parameter_name} lung cancer', save_path)
    print(f"Feature selection for {parameter_name} completed. Results saved to {save_path}.")

### Healthy vs Stage I

In [45]:
parameterSpecficSelection(stage_0_data, stage_1_data, stage_0_labels, stage_1_labels, "Stage I" , "../results/recursive_feature_elimination/recursive_feature_selection_stage1.csv")

Top 10 ranked features for Stage I lung cancer
hsa-mir-19a
hsa-mir-3125
hsa-mir-4487
hsa-mir-4505
hsa-mir-4661
hsa-mir-4733
hsa-mir-4745
hsa-mir-5571
hsa-mir-5690
hsa-mir-607
Saved RFE results to: ../results/recursive_feature_elimination/recursive_feature_selection_stage1.csv


### Healthy vs Stage II

In [46]:
parameterSpecficSelection(stage_0_data, stage_2_data, stage_0_labels, stage_2_labels, "Stage II", "../results/recursive_feature_elimination/recursive_feature_selection_stage2.csv")

Top 10 ranked features for Stage II lung cancer
hsa-mir-1282
hsa-mir-2114
hsa-mir-30b
hsa-mir-3683
hsa-mir-449a
hsa-mir-4663
hsa-mir-571
hsa-mir-6868
hsa-mir-6872
hsa-mir-7849
Saved RFE results to: ../results/recursive_feature_elimination/recursive_feature_selection_stage2.csv


### Healthy vs Stage III

In [47]:
parameterSpecficSelection(stage_0_data, stage_3_data, stage_0_labels, stage_3_labels, "Stage III", "../results/recursive_feature_elimination/recursive_feature_selection_stage3.csv")

Top 10 ranked features for Stage III lung cancer
hsa-mir-122
hsa-mir-3193
hsa-mir-4729
hsa-mir-4770
hsa-mir-497
hsa-mir-5681a
hsa-mir-5690
hsa-mir-571
hsa-mir-6507
hsa-mir-6879
Saved RFE results to: ../results/recursive_feature_elimination/recursive_feature_selection_stage3.csv


### Healthy vs Stage IV

In [48]:
parameterSpecficSelection(stage_0_data, stage_4_data, stage_0_labels, stage_4_labels, "Stage IV", "../results/recursive_feature_elimination/recursive_feature_selection_stage4.csv")

Top 10 ranked features for Stage IV lung cancer
hsa-mir-3162
hsa-mir-3202-1
hsa-mir-3936
hsa-mir-4679-2
hsa-mir-571
hsa-mir-6506
hsa-mir-6879
hsa-mir-7113
hsa-mir-7843
hsa-mir-7848
Saved RFE results to: ../results/recursive_feature_elimination/recursive_feature_selection_stage4.csv


# Selecting specific features for each subtype 

### Healthy vs Adenocarcinoma

In [49]:
parameterSpecficSelection(subtype_0_data, subtype_1_data, subtype_0_labels, subtype_1_labels, "adenocarcinoma", "../results/recursive_feature_elimination/recursive_feature_selection_subtype1.csv")

Top 10 ranked features for adenocarcinoma lung cancer
hsa-mir-218-1
hsa-mir-3688-1
hsa-mir-378d-2
hsa-mir-4487
hsa-mir-4647
hsa-mir-4661
hsa-mir-4678
hsa-mir-500b
hsa-mir-607
hsa-mir-7111
Saved RFE results to: ../results/recursive_feature_elimination/recursive_feature_selection_subtype1.csv


### Healthy vs Squamous cell carcinoma

In [50]:
parameterSpecficSelection(subtype_0_data, subtype_2_data, subtype_0_labels, subtype_2_labels, "squamous cell carcinoma", "../results/recursive_feature_elimination/recursive_feature_selection_subtype2.csv")

Top 10 ranked features for squamous cell carcinoma lung cancer
hsa-mir-1294
hsa-mir-19a
hsa-mir-3171
hsa-mir-4661
hsa-mir-4678
hsa-mir-4795
hsa-mir-483
hsa-mir-497
hsa-mir-607
hsa-mir-6868
Saved RFE results to: ../results/recursive_feature_elimination/recursive_feature_selection_subtype2.csv


### Heathy vs large cell*

There are currently no samples of large cell lung cancer in the data set

### Healthy vs Mesothelimoa

In [51]:
parameterSpecficSelection(subtype_0_data, subtype_4_data, subtype_0_labels, subtype_4_labels, "mesothelimoa", "../results/recursive_feature_elimination/recursive_feature_selection_subtype4.csv")

Top 10 ranked features for mesothelimoa lung cancer
hsa-mir-2861
hsa-mir-3132
hsa-mir-4498
hsa-mir-4505
hsa-mir-4512
hsa-mir-4701
hsa-mir-4735
hsa-mir-4776-1
hsa-mir-6791
hsa-mir-6863
Saved RFE results to: ../results/recursive_feature_elimination/recursive_feature_selection_subtype4.csv


### Healthy vs Small cell*

There are currently no samples of large cell lung cancer in the data set