#### Group 33, Florida Atlantic University <br> Chi-Squared Feature Selection for miRNA Data

> Import Libraries

In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

> Load Dataset

In [10]:
# Read the labelled miRNA table from the processed-data directory.
labeled_miRNA_data = pd.read_csv('../processed_data/miRNA_stage_subtype.csv')

# Refuse to continue if any cell of the table is missing.
has_missing_values = labeled_miRNA_data.isna().to_numpy().any()
assert not has_missing_values, "NaN values detected in input data"

# Quick overview of what was loaded.
print("Dataset columns:", labeled_miRNA_data.columns)
print("Dataset shape:", labeled_miRNA_data.shape)

Dataset columns: Index(['hsa-let-7a-1', 'hsa-let-7a-2', 'hsa-let-7a-3', 'hsa-let-7b',
       'hsa-let-7c', 'hsa-let-7d', 'hsa-let-7e', 'hsa-let-7f-1',
       'hsa-let-7f-2', 'hsa-let-7g',
       ...
       'hsa-mir-943', 'hsa-mir-944', 'hsa-mir-95', 'hsa-mir-9500',
       'hsa-mir-96', 'hsa-mir-98', 'hsa-mir-99a', 'hsa-mir-99b', 'stage',
       'subtype'],
      dtype='object', length=1883)
Dataset shape: (1091, 1883)


> Separate Features and Target

In [11]:
# Label columns that must not be passed to the selector as features.
label_columns = ['stage', 'subtype']

# Remove the label columns by name instead of by position, so the split does
# not depend on them being the last two columns of the CSV. A missing label
# column raises KeyError here rather than silently mis-splitting the data.
X = labeled_miRNA_data.drop(columns=label_columns)
y = labeled_miRNA_data['stage']  # Target variable

print("Shape of features (X):", X.shape)
print("Shape of target labels (y):", y.shape)

Shape of features (X): (1091, 1881)
Shape of target labels (y): (1091,)


> Normalize Features

In [12]:
# Rescale every feature column to the [0, 1] range before chi-squared scoring
# (the assertion below guards the non-negativity the later step relies on).
scaler = MinMaxScaler(feature_range=(0, 1))

# fit_transform returns a bare array, so re-attach both the column names and
# the original row index; without the index the frame would get a fresh
# RangeIndex that may no longer line up with y.
X_normalized = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

# Sanity checks on the scaled matrix.
assert not X_normalized.isna().any().any(), "NaN values detected after normalization"
assert (X_normalized >= 0).all().all(), "Negative values detected after normalization"
print("Normalized features:")
print(X_normalized.head())

Normalized features:
   hsa-let-7a-1  hsa-let-7a-2  hsa-let-7a-3  hsa-let-7b  hsa-let-7c  \
0      0.153200      0.154740      0.151317    0.038404    0.013304   
1      0.212347      0.210383      0.211152    0.063621    0.024217   
2      0.077101      0.073947      0.074232    0.043767    0.010982   
3      0.528039      0.525213      0.523103    0.054365    0.229020   
4      0.171658      0.171466      0.169991    0.070441    0.041250   

   hsa-let-7d  hsa-let-7e  hsa-let-7f-1  hsa-let-7f-2  hsa-let-7g  ...  \
0    0.008747    0.155592      0.108634      0.109582    0.160672  ...   
1    0.010103    0.099468      0.130015      0.129169    0.183103  ...   
2    0.026749    0.022682      0.040175      0.040651    0.147759  ...   
3    0.014951    0.305986      0.527441      0.520830    0.491702  ...   
4    0.023749    0.073326      0.085757      0.084314    0.123224  ...   

   hsa-mir-941-5  hsa-mir-942  hsa-mir-943  hsa-mir-944  hsa-mir-95  \
0            0.0     0.026466       

> Perform Chi-Squared Feature Selection


In [13]:
# Score each normalized feature against the stage labels with the chi-squared
# statistic, then keep the 10 best-scoring columns.
chi2_selector = SelectKBest(score_func=chi2, k=10)
chi2_selector.fit(X_normalized, y)
X_kbest = chi2_selector.transform(X_normalized)

print("Shape of reduced features (X_kbest):", X_kbest.shape)

Shape of reduced features (X_kbest): (1091, 10)


> Create Ranked Feature DataFrame


In [14]:
# Pair every feature name with the score the fitted selector assigned to it.
feature_scores = {'Feature': X.columns, 'Score': chi2_selector.scores_}
chi2_scores = pd.DataFrame(feature_scores)

# Rank from highest to lowest score, and normalize the feature names to plain
# strings without surrounding whitespace.
chi2_ranked_df = (
    chi2_scores
    .sort_values(by='Score', ascending=False)
    .assign(Feature=lambda ranked: ranked['Feature'].astype(str).str.strip())
)

print("Ranked features:")
print(chi2_ranked_df.head())

Ranked features:
           Feature      Score
1367   hsa-mir-571  71.190977
944   hsa-mir-4663  38.356704
659   hsa-mir-3973  34.193548
1824  hsa-mir-8082  34.193548
881   hsa-mir-451b  18.508169


> Handle NaN Scores

In [15]:
# Some features come back from chi2 with an undefined (NaN) score.
# NOTE(review): presumably these are features that are constant across all
# samples (all zeros after min-max scaling) — confirm against the raw data.
nan_score_mask = chi2_ranked_df['Score'].isna()

print("Features with NaN scores:")
print(chi2_ranked_df[nan_score_mask])

# Rebind to a filtered copy instead of dropna(inplace=True): the frame object
# that the previous cell displayed is left untouched.
chi2_ranked_df = chi2_ranked_df.dropna(subset=['Score'])  # Drop rows with NaN scores
print("Cleaned ranked features:")
print(chi2_ranked_df.head())

Features with NaN scores:
             Feature  Score
18    hsa-mir-103b-1    NaN
19    hsa-mir-103b-2    NaN
32      hsa-mir-1183    NaN
41      hsa-mir-1200    NaN
42      hsa-mir-1202    NaN
...              ...    ...
1868   hsa-mir-941-2    NaN
1869   hsa-mir-941-3    NaN
1870   hsa-mir-941-4    NaN
1871   hsa-mir-941-5    NaN
1876    hsa-mir-9500    NaN

[245 rows x 2 columns]
Cleaned ranked features:
           Feature      Score
1367   hsa-mir-571  71.190977
944   hsa-mir-4663  38.356704
659   hsa-mir-3973  34.193548
1824  hsa-mir-8082  34.193548
881   hsa-mir-451b  18.508169


> Save and Display Results

In [16]:
# Single source of truth for the output location, so the confirmation message
# always reports the path that was actually written.
# NOTE(review): the previous message claimed '../results/chi_squared_features.csv'
# while the file was written next to the notebook; if '../results/' is the
# intended destination, change output_path here.
output_path = './chi_squared_features.csv'

# Write the ranked (Feature, Score) table without the positional index column.
chi2_ranked_df.to_csv(output_path, index=False)
print(f"Saved chi-squared ranked features to '{output_path}'")

Saved chi-squared ranked features to '../results/chi_squared_features.csv'
