In [85]:
# Group 33, Florida Atlantic University
# Recursive Feature Elimination model
# 1/14/25

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Extracting Data 
labeled_miRNA_data = pd.read_csv('../processed_data/miRNA_stage_subtype.csv')

In [86]:
labeled_miRNA_data.head(5)

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b,stage,subtype
0,7314.747386,7391.483138,7334.393081,10994.201497,471.496698,318.193106,1156.241547,3272.099771,3363.611772,442.783758,...,0.0,0.0,1.847031,0,40.298863,35.429417,148.602058,12118.707689,1,2
1,9518.042994,9460.443528,9574.874468,17578.281899,785.810318,358.652676,771.986446,3871.452122,3917.224498,487.829079,...,0.0,128.562009,4.607957,0,8.60152,38.86044,111.512567,7471.802757,1,2
2,4479.97634,4387.407628,4447.955716,12394.31011,404.624244,855.241747,246.267705,1353.016896,1415.311564,416.8503,...,0.0,161.267504,1.746579,0,33.767203,31.43843,168.253822,16026.613214,1,2
3,21277.962603,21166.590502,21255.800397,15161.474118,6684.570363,503.278464,2185.922959,15012.229891,14987.262342,1107.549261,...,0.0,1.683206,10.660302,0,5.049617,95.101114,1416.978551,12750.562682,1,2
4,8002.355461,8013.396682,8033.638922,19358.942067,1276.411235,765.754731,593.005616,2630.801098,2649.43316,367.580673,...,0.0,97.990843,3.450382,0,22.77252,46.235116,455.450396,14401.203493,1,2


In [87]:
print(labeled_miRNA_data.shape)

(1091, 1883)


# Preprocessing

In [88]:
# Seperating data from labels
pos_neg_labels, stage_labels, subtype_labels = labeled_miRNA_data.iloc[:, -2], labeled_miRNA_data.iloc[:, -2], labeled_miRNA_data.iloc[:, -1]

# Condensing stage information into a general diagnosis
# 0 == negative, 1 == posistive for lung cancer
pos_neg_labels = pos_neg_labels.apply(lambda x: 1 if x > 1 else x)

miRNA_data = labeled_miRNA_data.iloc[:, : -2]

In [89]:
# Logistic Regression responds better to scaled data
scaler = StandardScaler()
miRNA_data_scaled_np = scaler.fit_transform(miRNA_data)

# Diagnosis level feature selection
### i.e negative vs posistive cases

In [90]:
estimator_pos_neg = LogisticRegression(solver='lbfgs', max_iter=2000, class_weight='balanced')
selector_pos_neg = RFE(estimator_pos_neg, n_features_to_select=10,step=10)
selector_pos_neg = selector_pos_neg.fit(miRNA_data_scaled_np, pos_neg_labels)

In [91]:
print(len(selector_pos_neg.support_))

1881


In [92]:
miRNA_list = miRNA_data.columns.to_list()

def displayRankedFeatures(selector, disp_string):
    top_features_mask = selector.support_
    top_features_indices = [i for i, selected in enumerate(top_features_mask) if selected]

    print(disp_string)

    for idx in top_features_indices:
        print(miRNA_list[idx])

In [93]:
displayRankedFeatures(selector_pos_neg, "Top 10 ranked features for general diagnosis of lung cancer")

Top 10 ranked features for general diagnosis of lung cancer
hsa-mir-3688-1
hsa-mir-4435-2
hsa-mir-4487
hsa-mir-4647
hsa-mir-4661
hsa-mir-4678
hsa-mir-4745
hsa-mir-501
hsa-mir-597
hsa-mir-639


# Stage level feature selection

In [94]:
estimator_stage = LogisticRegression(solver='lbfgs', max_iter=2000, class_weight='balanced')
selector_stage = RFE(estimator_stage, n_features_to_select=10,step=10)
selector_stage = selector_stage.fit(miRNA_data_scaled_np, stage_labels)

In [95]:
displayRankedFeatures(selector_stage, "Top 10 ranked features for stage prediction of lung cancer")

Top 10 ranked features for stage prediction of lung cancer
hsa-mir-135a-1
hsa-mir-3125
hsa-mir-3653
hsa-mir-3927
hsa-mir-4461
hsa-mir-4487
hsa-mir-4730
hsa-mir-5690
hsa-mir-607
hsa-mir-6506


# Subtype level feature selection

In [96]:
estimator_subtype = LogisticRegression(solver='lbfgs', max_iter=2000, class_weight='balanced')
selector_subtype = RFE(estimator_subtype, n_features_to_select=10,step=10)
selector_subtype = selector_subtype.fit(miRNA_data_scaled_np, subtype_labels)

In [97]:
displayRankedFeatures(selector_subtype, "Top 10 ranked features for subtype prediction of lung cancer")

Top 10 ranked features for subtype prediction of lung cancer
hsa-mir-1303
hsa-mir-3688-1
hsa-mir-378d-2
hsa-mir-378i
hsa-mir-4487
hsa-mir-4647
hsa-mir-4678
hsa-mir-5690
hsa-mir-5696
hsa-mir-6764


# Selecting features specific for each stage

In [98]:
# Segmenting the data based on stage 0 - 4

stage_0_df = labeled_miRNA_data[labeled_miRNA_data["stage"] == 0] # healthy patients
stage_1_df = labeled_miRNA_data[labeled_miRNA_data["stage"] == 1]
stage_2_df = labeled_miRNA_data[labeled_miRNA_data["stage"] == 2]
stage_3_df = labeled_miRNA_data[labeled_miRNA_data["stage"] == 3]
stage_4_df = labeled_miRNA_data[labeled_miRNA_data["stage"] == 4]

stage_0_data, stage_0_labels = stage_0_df.iloc[:, : -2], stage_0_df.iloc[:, -2]
stage_1_data, stage_1_labels = stage_1_df.iloc[:, : -2], stage_1_df.iloc[:, -2]
stage_2_data, stage_2_labels = stage_2_df.iloc[:, : -2], stage_2_df.iloc[:, -2]
stage_3_data, stage_3_labels = stage_3_df.iloc[:, : -2], stage_3_df.iloc[:, -2]
stage_4_data, stage_4_labels = stage_4_df.iloc[:, : -2], stage_4_df.iloc[:, -2]


In [99]:
# Segmenting the data based on subtype 0 - 5

# subtype

subtype_0_df = labeled_miRNA_data[labeled_miRNA_data["subtype"] == 0] # healthy patients
subtype_1_df = labeled_miRNA_data[labeled_miRNA_data["subtype"] == 1]
subtype_2_df = labeled_miRNA_data[labeled_miRNA_data["subtype"] == 2]
subtype_3_df = labeled_miRNA_data[labeled_miRNA_data["subtype"] == 3]
subtype_4_df = labeled_miRNA_data[labeled_miRNA_data["subtype"] == 4]
subtype_5_df = labeled_miRNA_data[labeled_miRNA_data["subtype"] == 5]

subtype_0_data, subtype_0_labels = subtype_0_df.iloc[:, : -2], subtype_0_df.iloc[:, -1]
subtype_1_data, subtype_1_labels = subtype_1_df.iloc[:, : -2], subtype_1_df.iloc[:, -1]
subtype_2_data, subtype_2_labels = subtype_2_df.iloc[:, : -2], subtype_2_df.iloc[:, -1]
subtype_3_data, subtype_3_labels = subtype_3_df.iloc[:, : -2], subtype_3_df.iloc[:, -1]
subtype_4_data, subtype_4_labels = subtype_4_df.iloc[:, : -2], subtype_4_df.iloc[:, -1]
subtype_5_data, subtype_5_labels = subtype_5_df.iloc[:, : -2], subtype_5_df.iloc[:, -1]

In [100]:
def parameterSpecficSelection(healthy_data, unhealthy_data, healthy_labels, unhealthy_labels, parameter_name):

    X = pd.concat([healthy_data, unhealthy_data])
    Y = pd.concat([healthy_labels, unhealthy_labels])

    X = scaler.fit_transform(X)

    estimator = LogisticRegression(solver='lbfgs', max_iter=2000, class_weight='balanced')
    selector = RFE(estimator, n_features_to_select=10,step=10)
    selector = selector.fit(X, Y)

    displayRankedFeatures(selector, f'Top 10 ranked features for {parameter_name} lung cancer')


### Healthy vs Stage I

In [101]:
parameterSpecficSelection(stage_0_data, stage_1_data, stage_0_labels, stage_1_labels, "Stage I" )

Top 10 ranked features for Stage I lung cancer
hsa-mir-19a
hsa-mir-3125
hsa-mir-4487
hsa-mir-4505
hsa-mir-4661
hsa-mir-4733
hsa-mir-4745
hsa-mir-5571
hsa-mir-5690
hsa-mir-607


### Healthy vs Stage II

In [102]:
parameterSpecficSelection(stage_0_data, stage_2_data, stage_0_labels, stage_2_labels, "Stage II" )

Top 10 ranked features for Stage II lung cancer
hsa-mir-1282
hsa-mir-2114
hsa-mir-30b
hsa-mir-3683
hsa-mir-449a
hsa-mir-4663
hsa-mir-571
hsa-mir-6868
hsa-mir-6872
hsa-mir-7849


### Healthy vs Stage III

In [103]:
parameterSpecficSelection(stage_0_data, stage_3_data, stage_0_labels, stage_3_labels, "Stage III" )

Top 10 ranked features for Stage III lung cancer
hsa-mir-122
hsa-mir-3193
hsa-mir-4729
hsa-mir-4770
hsa-mir-497
hsa-mir-5681a
hsa-mir-5690
hsa-mir-571
hsa-mir-6507
hsa-mir-6879


### Healthy vs Stage IV

In [104]:
parameterSpecficSelection(stage_0_data, stage_4_data, stage_0_labels, stage_4_labels, "Stage IV" )

Top 10 ranked features for Stage IV lung cancer
hsa-mir-3162
hsa-mir-3202-1
hsa-mir-3936
hsa-mir-4679-2
hsa-mir-571
hsa-mir-6506
hsa-mir-6879
hsa-mir-7113
hsa-mir-7843
hsa-mir-7848


# Selecting specific features for each subtype 

### Healthy vs Adenocarcinoma

In [105]:
parameterSpecficSelection(subtype_0_data, subtype_1_data, subtype_0_labels, subtype_1_labels, "adenocarcinoma") 

Top 10 ranked features for adenocarcinoma lung cancer
hsa-mir-218-1
hsa-mir-3688-1
hsa-mir-378d-2
hsa-mir-4487
hsa-mir-4647
hsa-mir-4661
hsa-mir-4678
hsa-mir-500b
hsa-mir-607
hsa-mir-7111


### Healthy vs Squamous cell carcinoma

In [106]:
parameterSpecficSelection(subtype_0_data, subtype_2_data, subtype_0_labels, subtype_2_labels, "squamous cell carcinoma")

Top 10 ranked features for squamous cell carcinoma lung cancer
hsa-mir-1294
hsa-mir-19a
hsa-mir-3171
hsa-mir-4661
hsa-mir-4678
hsa-mir-4795
hsa-mir-483
hsa-mir-497
hsa-mir-607
hsa-mir-6868


### Heathy vs large cell*

There are currently no samples of large cell lung cancer in the data set

### Healthy vs Mesothelimoa

In [107]:
parameterSpecficSelection(subtype_0_data, subtype_4_data, subtype_0_labels, subtype_4_labels, "mesothelimoa")

Top 10 ranked features for mesothelimoa lung cancer
hsa-mir-3659
hsa-mir-3686
hsa-mir-4420
hsa-mir-4466
hsa-mir-4498
hsa-mir-4505
hsa-mir-4512
hsa-mir-4701
hsa-mir-6811
hsa-mir-6863


### Healthy vs Small cell*

There are currently no samples of large cell lung cancer in the data set