In [37]:
# Group 33, Florida Atlantic University
# Recursive Feature Elimination model
# 1/14/25

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Read the labelled miRNA expression table from the processed-data folder.
miRNA_csv_path = '../processed_data/miRNA_stage_subtype.csv'
labeled_miRNA_data = pd.read_csv(miRNA_csv_path)

In [3]:
# Preview the first five rows (five is the default row count for head()).
labeled_miRNA_data.head()

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b,stage,subtype
0,7314.747386,7391.483138,7334.393081,10994.201497,471.496698,318.193106,1156.241547,3272.099771,3363.611772,442.783758,...,0.0,0.0,1.847031,0,40.298863,35.429417,148.602058,12118.707689,1,2
1,9518.042994,9460.443528,9574.874468,17578.281899,785.810318,358.652676,771.986446,3871.452122,3917.224498,487.829079,...,0.0,128.562009,4.607957,0,8.60152,38.86044,111.512567,7471.802757,1,2
2,4479.97634,4387.407628,4447.955716,12394.31011,404.624244,855.241747,246.267705,1353.016896,1415.311564,416.8503,...,0.0,161.267504,1.746579,0,33.767203,31.43843,168.253822,16026.613214,1,2
3,21277.962603,21166.590502,21255.800397,15161.474118,6684.570363,503.278464,2185.922959,15012.229891,14987.262342,1107.549261,...,0.0,1.683206,10.660302,0,5.049617,95.101114,1416.978551,12750.562682,1,2
4,8002.355461,8013.396682,8033.638922,19358.942067,1276.411235,765.754731,593.005616,2630.801098,2649.43316,367.580673,...,0.0,97.990843,3.450382,0,22.77252,46.235116,455.450396,14401.203493,1,2


In [4]:
# Show the table dimensions as a (rows, columns) tuple.
n_rows, n_cols = labeled_miRNA_data.shape
print((n_rows, n_cols))

(1091, 1883)


# Diagnosis level feature selection
### i.e. negative vs. positive cases

In [5]:
# Condense the stage information into a simple negative/positive label.
# Drop the last column of the raw table so that the stage column becomes the
# last one, and take an explicit copy: the frame is modified below, and writing
# into a slice of `labeled_miRNA_data` is what triggers pandas'
# SettingWithCopyWarning.
labeled_pos_neg_data = labeled_miRNA_data.iloc[:, :-1].copy()

# 0 == negative, 1 == positive.
# Every value greater than 1 is capped at 1; values <= 1 are left untouched.
# `clip(upper=1)` is the vectorised equivalent of applying
# `lambda x: 1 if x > 1 else x` element by element.
labeled_pos_neg_data.iloc[:, -1] = labeled_pos_neg_data.iloc[:, -1].clip(upper=1)

In [6]:
# Preview the first five rows of the binarised table (head() defaults to 5).
labeled_pos_neg_data.head()

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b,stage
0,7314.747386,7391.483138,7334.393081,10994.201497,471.496698,318.193106,1156.241547,3272.099771,3363.611772,442.783758,...,3.022415,0.0,0.0,1.847031,0,40.298863,35.429417,148.602058,12118.707689,1
1,9518.042994,9460.443528,9574.874468,17578.281899,785.810318,358.652676,771.986446,3871.452122,3917.224498,487.829079,...,6.45114,0.0,128.562009,4.607957,0,8.60152,38.86044,111.512567,7471.802757,1
2,4479.97634,4387.407628,4447.955716,12394.31011,404.624244,855.241747,246.267705,1353.016896,1415.311564,416.8503,...,2.910966,0.0,161.267504,1.746579,0,33.767203,31.43843,168.253822,16026.613214,1
3,21277.962603,21166.590502,21255.800397,15161.474118,6684.570363,503.278464,2185.922959,15012.229891,14987.262342,1107.549261,...,6.452288,0.0,1.683206,10.660302,0,5.049617,95.101114,1416.978551,12750.562682,1
4,8002.355461,8013.396682,8033.638922,19358.942067,1276.411235,765.754731,593.005616,2630.801098,2649.43316,367.580673,...,9.201018,0.0,97.990843,3.450382,0,22.77252,46.235116,455.450396,14401.203493,1


In [7]:
# Count how many samples fall into each value of the label (last) column.
label_counts = labeled_pos_neg_data.iloc[:, -1].value_counts()
print(label_counts)

stage
1    1080
0      11
Name: count, dtype: int64


In [44]:
# Separate the labels from the features and standardise the features.

# The last column holds the labels; every column before it is a feature.
pos_neg_labels = labeled_pos_neg_data.iloc[:, -1]

# Fit the scaler on the feature columns, then transform those same columns.
# The transformed result is a NumPy array, so column names are not carried over.
scaler = StandardScaler()
scaler.fit(labeled_pos_neg_data.iloc[:, :-1])
pos_neg_data = scaler.transform(labeled_pos_neg_data.iloc[:, :-1])

# NOTE(review): the array returned here is not assigned to anything, so
# `pos_neg_labels` remains a pandas Series after this cell runs.
pos_neg_labels.to_numpy()

In [46]:
# Print the scaled feature matrix followed by the labels, one after the other.
print(pos_neg_data, pos_neg_labels, sep="\n")

[[-4.49189552e-01 -4.34839070e-01 -4.51949003e-01 ... -8.89456720e-01
  -6.86347418e-01 -5.76650142e-01]
 [-8.38956099e-02 -9.14941776e-02 -8.02910999e-02 ... -7.93288702e-01
  -7.82991265e-01 -7.51970293e-01]
 [-9.19178548e-01 -9.33366730e-01 -9.30760060e-01 ... -1.00131995e+00
  -6.35140943e-01 -4.29211233e-01]
 ...
 [-4.04017136e-01 -3.95440747e-01 -4.06126529e-01 ... -8.32261125e-01
  -5.37784837e-01 -1.52544355e-01]
 [ 1.00209005e-03  1.04902616e-02  6.78057825e-03 ... -1.11845178e+00
  -1.94654343e-01 -3.36702905e-02]
 [-6.88004572e-01 -6.81502330e-01 -7.02486228e-01 ... -2.46900102e-01
  -2.94990953e-01 -1.60585459e-01]]
0       1
1       1
2       1
3       1
4       1
       ..
1086    1
1087    1
1088    1
1089    1
1090    1
Name: stage, Length: 1091, dtype: int64


In [50]:
# Recursive feature elimination wrapped around a logistic-regression estimator:
# 10 features are dropped per elimination round until 10 remain.
# NOTE(review): the label counts printed earlier are 1080 vs 11; the estimator
# is fitted without class weighting -- confirm that this is intended.
estimator_pos_neg = LogisticRegression(max_iter=1000)
selector_pos_neg = RFE(
    estimator=estimator_pos_neg,
    n_features_to_select=10,
    step=10,
)
# fit() returns the selector itself, so no re-assignment is needed.
selector_pos_neg.fit(pos_neg_data, pos_neg_labels)

In [51]:
# Number of entries in the selection mask (one entry per input feature).
print(selector_pos_neg.support_.shape[0])

1881


In [52]:
# Boolean mask over the input features: True where RFE kept the feature.
top_features_mask_pos_neg = selector_pos_neg.support_
# Positions of the kept features, as a plain list of Python ints.
top_features_indices_pos_neg = np.flatnonzero(top_features_mask_pos_neg).tolist()

# Column names of the labelled frame. The label column is its last entry, so
# the feature positions computed above index the leading entries directly.
miRNA_list = labeled_pos_neg_data.columns.to_list()

print("Top 10 ranked features for general diagnosis of lung cancer")

selected_miRNA_names = [miRNA_list[position] for position in top_features_indices_pos_neg]
for miRNA_name in selected_miRNA_names:
    print(miRNA_name)

Top 10 ranked features for general diagnosis of lung cancer
hsa-mir-19a
hsa-mir-4435-2
hsa-mir-4487
hsa-mir-4638
hsa-mir-4647
hsa-mir-4661
hsa-mir-4678
hsa-mir-501
hsa-mir-5690
hsa-mir-6811


# Stage level feature selection