In [1]:
import pandas as pd
import numpy as np
import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA


In [2]:
# Read the normalized sample table (features plus labels) into a DataFrame.
filepath = "./GEO Samples/Normalized_samples_with_lables.csv"
df = pd.read_csv(filepath_or_buffer=filepath)

# Number of samples per class in the file as loaded:
#
#   class 0 -> 300
#   class 1 -> 265
#   class 2 ->  72
#   class 3 -> 331
#   class 4 ->  32

In [3]:
# Replicate classes 2 and 4 up to around 300 samples each == Total of 1472 Samples
#
# Class 4: originally has 32 samples -> original rows + 8 appended copies = 288
# Class 2: originally has 72 samples -> original rows + 3 appended copies = 288
#
# The number of copies is derived from a target size instead of being
# hard-coded, so re-running this cell on an already oversampled `df` is a no-op
# rather than multiplying the minority classes a second time.

TARGET_STAGE_SIZE = 288


def oversample_stage(frame, stage, target_size):
    """Return `frame` with whole copies of one stage's rows appended.

    Parameters
    ----------
    frame : pd.DataFrame
        Table with a 'Stage' column.
    stage : int
        Value of 'Stage' whose rows are replicated.
    target_size : int
        Desired number of rows for `stage`. Only whole copies of the existing
        rows are appended, so the final count is the largest multiple of the
        current count that does not exceed `target_size`.

    Returns
    -------
    pd.DataFrame
        `frame` itself when the stage is absent or already at/above the
        target; otherwise a new frame (index reset) holding the original rows
        followed by the appended copies.
    """
    stage_rows = frame[frame['Stage'] == stage]
    if stage_rows.empty:
        return frame

    # Copies to append on top of the rows that are already present.
    extra_copies = target_size // len(stage_rows) - 1
    if extra_copies <= 0:
        return frame

    return pd.concat([frame] + [stage_rows] * extra_copies, ignore_index=True)


# Same order as before (class 4 first, then class 2) so the resulting row
# order is unchanged.
for minority_stage in (4, 2):
    df = oversample_stage(df, minority_stage, TARGET_STAGE_SIZE)

df['Stage'].value_counts()

Stage
3    331
0    300
2    288
4    288
1    265
Name: count, dtype: int64

In [4]:
# Shuffle rows. A fixed seed keeps the row order -- and therefore the CSV
# files written further down -- identical across "Restart & Run All".
SHUFFLE_SEED = 42
data = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Labels stay a one-column DataFrame (later cells index it as y["Stage"] and
# write it with to_csv); the features are every remaining column.
y = data[['Stage']]
x = data.drop(columns='Stage')


In [5]:
# Dump every column name of `data` to a text file, one name per line.
# NOTE(review): `data.columns` still includes the 'Stage' label column, so it
# ends up in the list as well -- confirm that this is intended.
col = data.columns
filepath = "./GEO Samples/"

listing_path = filepath+"list of miRNA parts.txt"
with open(listing_path, "w") as txt_file:
    txt_file.writelines(name + "\n" for name in col)

In [6]:
from sklearn.feature_selection import mutual_info_classif, SelectKBest

k = 100
MI_SEED = 42


def seeded_mutual_info(features, target):
    """Score features by mutual information with a fixed random seed.

    `mutual_info_classif` adds small random noise to continuous features, so
    without a seed the scores (and the selected top-k set) change from run to
    run. Pinning `random_state` makes the selection reproducible.
    """
    return mutual_info_classif(features, target, random_state=MI_SEED)


# Keep the k features with the highest mutual information with the label.
mi_selector = SelectKBest(score_func=seeded_mutual_info, k=k)
x_mi_selected = mi_selector.fit_transform(x, y["Stage"])

# Get the actual (positional) indices of the selected features
mi_indices = mi_selector.get_support(indices=True)
mi_indices

array([  28,   33,   68,  154,  157,  218,  222,  234,  251,  298,  301,
        417,  432,  437,  460,  472,  499,  542,  546,  569,  570,  579,
        621,  634,  702,  703,  712,  713,  767,  834,  876,  914,  938,
        970,  991, 1005, 1014, 1022, 1039, 1080, 1101, 1140, 1150, 1168,
       1179, 1181, 1190, 1193, 1213, 1246, 1263, 1299, 1313, 1319, 1422,
       1460, 1502, 1503, 1505, 1509, 1511, 1547, 1550, 1566, 1575, 1576,
       1577, 1580, 1606, 1699, 1749, 1793, 1802, 1803, 1806, 1815, 1879,
       1951, 1961, 1962, 1978, 2025, 2054, 2084, 2085, 2101, 2109, 2149,
       2197, 2213, 2219, 2223, 2226, 2229, 2231, 2256, 2286, 2358, 2444,
       2542])

In [7]:
# Train a Random Forest on the full data and read off the per-feature importances

from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_estimators=100,
)
rf.fit(x, y["Stage"])

# One importance value per column of `x`.
importances = rf.feature_importances_
importances

array([0.0032101 , 0.00031869, 0.0001376 , ..., 0.00027819, 0.00022012,
       0.0004945 ], shape=(2550,))

In [8]:
# Get top k feature indices by model importance (descending importance order)
top_k_model_indices = np.argsort(importances)[::-1][:k]

# Features chosen by BOTH selectors. The indices are converted to plain Python
# ints so the printed list reads [28, 68, ...] instead of a wall of
# np.int64(...) wrappers; `overlap` is still a set and still works with
# sorted() / .iloc.
overlap = {int(i) for i in set(mi_indices) & set(top_k_model_indices)}
print(f"Overlap count: {len(overlap)} out of {k}")
print(f"Overlap indices: {sorted(overlap)}")


Overlap count: 40 out of 100
Overlap indices: [np.int64(28), np.int64(68), np.int64(154), np.int64(218), np.int64(234), np.int64(251), np.int64(417), np.int64(432), np.int64(542), np.int64(702), np.int64(713), np.int64(767), np.int64(876), np.int64(970), np.int64(1005), np.int64(1039), np.int64(1140), np.int64(1181), np.int64(1313), np.int64(1319), np.int64(1460), np.int64(1503), np.int64(1509), np.int64(1550), np.int64(1576), np.int64(1606), np.int64(1802), np.int64(1803), np.int64(1806), np.int64(1815), np.int64(1961), np.int64(1962), np.int64(2054), np.int64(2084), np.int64(2101), np.int64(2109), np.int64(2197), np.int64(2231), np.int64(2444), np.int64(2542)]


In [9]:
# Keep only the feature columns whose positions are in `overlap`,
# in ascending positional order.
shared_positions = sorted(overlap)
x_validated = x.iloc[:, shared_positions]


In [10]:
# Write the reduced feature matrix to CSV without the row index.
filepath = "./GEO Samples/dimension_reduced_data.csv"
x_validated.to_csv(path_or_buf=filepath, index=False)

In [10]:
# Write the matching label column to its own CSV without the row index.
filepath = "./GEO Samples/dimension_reduced_data_lables.csv"
y.to_csv(path_or_buf=filepath, index=False)

In [11]:
# Display the reduced feature matrix as the cell output for a visual check.
x_validated

Unnamed: 0,MIMAT0005905,MIMAT0027511,MIMAT0027592,MIMAT0022260,MIMAT0030996,MIMAT0004602,MIMAT0007881,MIMAT0015058,MIMAT0027532,MIMAT0027496,...,MIMAT0019015,MIMAT0019870,MIMAT0019710,MIMAT0004982,MIMAT0005586,MIMAT0016904,MIMAT0018925,MIMAT0003308,MIMAT0023714,MIMAT0027678
0,-0.563530,-0.676899,-0.613754,-0.616112,-0.617573,-0.209904,-0.568578,-0.652115,-0.665888,-0.560236,...,-0.475093,-0.661859,-0.474133,-0.570284,-0.564518,-0.666483,-0.560242,-0.589134,-0.649169,-0.576645
1,0.619793,0.486105,0.239452,-0.103068,0.030734,-0.142678,0.109433,-0.005533,-0.053617,-0.263061,...,-0.164813,0.045862,-0.028877,-0.134391,0.296972,0.466287,-0.209197,0.279078,0.267280,-0.158757
2,-0.192372,-0.213598,-0.114211,-0.357493,-0.416196,-0.185084,-0.317175,-0.390730,-0.330464,-0.414441,...,-0.306740,-0.276968,-0.232986,-0.408917,-0.370074,-0.334135,-0.324663,-0.259967,-0.220953,-0.395902
3,-0.450547,-0.555213,-0.405803,-0.465161,-0.526437,-0.206608,-0.526885,-0.565866,-0.587601,-0.504205,...,-0.351150,-0.257355,-0.473669,-0.500121,-0.498729,-0.522012,-0.510337,-0.561274,-0.570246,-0.526629
4,1.515960,0.847589,0.908118,0.716764,1.995012,1.040899,1.082713,2.398670,1.358688,1.114047,...,0.885064,0.368244,0.701045,0.819741,1.397610,0.377175,1.403499,1.838532,1.326168,1.148812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1467,-0.714243,-0.721779,-0.746004,-0.659634,-0.555454,-0.229900,-0.556333,-0.658862,-0.652118,-0.572565,...,-0.511474,-0.841073,-0.624997,-0.607600,-0.595050,-0.648511,-0.580652,-0.552915,-0.608095,-0.586513
1468,-0.582841,-0.506387,-0.517910,-0.527595,-0.575699,-0.207736,-0.515139,-0.556781,-0.579881,-0.511285,...,-0.446093,-0.645588,-0.441961,-0.533745,-0.540749,-0.594572,-0.512850,-0.522752,-0.524199,-0.488406
1469,-0.681824,-0.634754,-0.639045,-0.571517,-0.521322,-0.226805,-0.504757,-0.580098,-0.605771,-0.499001,...,-0.480643,-0.675852,-0.584912,-0.565984,-0.492167,-0.584662,-0.533427,-0.500716,-0.582693,-0.549020
1470,-0.540930,-0.602789,-0.543325,-0.552407,-0.551633,-0.207236,-0.546044,-0.574144,-0.586035,-0.530142,...,-0.434288,-0.450068,-0.486719,-0.499275,-0.488984,-0.498031,-0.523334,-0.602165,-0.596793,-0.538932
