In [1]:
import pandas as pd
import numpy as np
import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [6]:
# Read the labelled sample table into a DataFrame; the 'Stage' column is the
# class label used throughout the rest of the notebook.
filepath = "./GEO Samples/Normalized_samples_with_lables.csv"
df = pd.read_csv(filepath_or_buffer=filepath)

# Number of samples per class (Stage) in the file as loaded:
#   Stage 0 -> 300
#   Stage 1 -> 265
#   Stage 2 ->  72
#   Stage 3 -> 331
#   Stage 4 ->  32

In [7]:
# Oversample the minority classes (Stage 2 and Stage 4) by duplicating their
# rows so each ends up with roughly 300 samples, like the other stages.
#   Stage 4: 32 originals + 8 extra copies of each = 288
#   Stage 2: 72 originals + 3 extra copies of each = 288
# Resulting dataset: 1472 samples in total.
#
# The number of copies is derived from a target size instead of being
# hard-coded, which makes this cell safe to re-run: once a stage has reached
# its target, no further copies are appended.
#
# NOTE(review): rows are duplicated before any train/test split. If the saved
# data is split later, identical rows can land on both sides of the split;
# consider oversampling only the training portion.
OVERSAMPLE_TARGETS = {4: 288, 2: 288}  # Stage label -> desired sample count


def oversample_stage(frame, stage, target, label_col='Stage'):
    """Append whole copies of one class's rows until it approaches ``target``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing the label column ``label_col``.
    stage : scalar
        Label value of the class to oversample.
    target : int
        Desired number of rows for that class. Only whole copies of the
        class are appended, so the final count is the largest multiple of
        the current class size that does not exceed ``target``.
    label_col : str, default 'Stage'
        Name of the label column.

    Returns
    -------
    pandas.DataFrame
        ``frame`` followed by the duplicated rows (with a fresh index), or
        ``frame`` itself when no copies are needed.
    """
    stage_rows = frame[frame[label_col] == stage]
    if stage_rows.empty:
        return frame

    # Extra full copies needed on top of the rows that are already present.
    n_copies = target // len(stage_rows) - 1
    if n_copies <= 0:
        # Already at (or above) the target, e.g. when the cell is re-run.
        return frame

    repeated = pd.concat([stage_rows] * n_copies, ignore_index=True)
    return pd.concat([frame, repeated], ignore_index=True)


# Same order as before: Stage 4 copies are appended first, then Stage 2.
for stage, target in OVERSAMPLE_TARGETS.items():
    df = oversample_stage(df, stage, target)

df['Stage'].value_counts()

Stage
3    331
0    300
2    288
4    288
1    265
Name: count, dtype: int64

In [10]:
# Shuffle the rows. A fixed seed keeps the row order identical across runs
# (42 matches the seed already used for the RandomForest in this notebook);
# without it the bootstrap samples drawn by the forest, and therefore its
# feature importances, change on every execution.
data = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into labels (y: single-column DataFrame 'Stage') and features
# (x: every other column). The mask is computed once and reused.
is_label = data.columns == 'Stage'
y = data.loc[:, is_label]
x = data.loc[:, ~is_label]


In [15]:
# Write every column name of the shuffled table to a text file, one per line.
# NOTE(review): the names come from `data`, not `x`, so the 'Stage' label
# column is part of the written list.
filepath = "./GEO Samples/"
col = data.columns

with open(filepath + "list of miRNA parts.txt", "w") as txt_file:
    txt_file.writelines(name + "\n" for name in col)

In [3]:
from functools import partial

from sklearn.feature_selection import mutual_info_classif, SelectKBest

# Number of top-scoring features to keep.
k = 100

# mutual_info_classif uses random noise internally, so its scores (and the
# resulting top-k set) vary between runs unless the seed is pinned.
# 42 matches the seed used for the RandomForest in this notebook.
mi_score_func = partial(mutual_info_classif, random_state=42)

mi_selector = SelectKBest(score_func=mi_score_func, k=k)
x_mi_selected = mi_selector.fit_transform(x, y["Stage"])

# Positional indices (into the columns of x) of the selected features
mi_indices = mi_selector.get_support(indices=True)
mi_indices

array([  28,   68,   76,  150,  218,  234,  251,  417,  432,  437,  460,
        472,  499,  542,  546,  569,  570,  579,  597,  621,  634,  702,
        703,  712,  713,  731,  767,  828,  834,  876,  914,  970, 1014,
       1022, 1036, 1039, 1101, 1107, 1140, 1150, 1168, 1179, 1181, 1183,
       1190, 1193, 1213, 1246, 1270, 1299, 1313, 1319, 1454, 1460, 1502,
       1503, 1505, 1509, 1547, 1550, 1575, 1576, 1606, 1646, 1693, 1699,
       1713, 1749, 1774, 1793, 1802, 1803, 1806, 1815, 1879, 1951, 1961,
       1962, 1970, 1978, 2025, 2040, 2054, 2084, 2085, 2101, 2109, 2137,
       2197, 2213, 2219, 2229, 2231, 2256, 2286, 2296, 2299, 2358, 2444,
       2456], dtype=int64)

In [4]:
# Train a Random Forest on the full data and read off its feature importances.

from sklearn.ensemble import RandomForestClassifier

# 100 trees, fixed seed, class weights balanced; fit() returns the fitted
# estimator itself, so construction and training can be chained.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
).fit(x, y["Stage"])

# One importance value per column of x
importances = rf.feature_importances_
importances

array([2.75018536e-03, 2.13597712e-04, 6.07382826e-04, ...,
       3.16028925e-04, 2.06400666e-05, 3.75611971e-04])

In [5]:
# Positions of the k features with the highest Random Forest importance
# (argsort is ascending, so reverse it before taking the first k).
top_k_model_indices = np.argsort(importances)[::-1][:k]

# Features chosen by BOTH methods. The index arrays are converted to plain
# Python ints first: a set of NumPy scalars prints as `np.int64(68), ...`
# under NumPy >= 2, which clutters the report below.
overlap = set(mi_indices.tolist()) & set(top_k_model_indices.tolist())
print(f"Overlap count: {len(overlap)} out of {k}")
print(f"Overlap indices: {sorted(overlap)}")


Overlap count: 43 out of 100
Overlap indices: [68, 150, 218, 234, 251, 417, 432, 472, 542, 702, 767, 834, 876, 914, 970, 1039, 1107, 1140, 1168, 1183, 1270, 1319, 1460, 1503, 1509, 1550, 1576, 1606, 1774, 1802, 1806, 1815, 1961, 1962, 2054, 2084, 2101, 2109, 2137, 2197, 2231, 2299, 2358]


In [6]:
# Keep only the feature columns selected by both methods, in ascending
# column position.
shared_positions = sorted(overlap)
x_validated = x.iloc[:, shared_positions]


In [9]:
# Save the reduced feature matrix as CSV, without the row index.
filepath = "./GEO Samples/dimension_reduced_data.csv"
x_validated.to_csv(path_or_buf=filepath, index=False)

In [10]:
# Save the matching 'Stage' labels as CSV, without the row index; rows are in
# the same order as the feature file because both come from `data`.
filepath = "./GEO Samples/dimension_reduced_data_lables.csv"
y.to_csv(path_or_buf=filepath, index=False)