#### Group 33, Florida Atlantic University
#### Chi-squared model for feature extraction
#### Adapted from fold-change logic
#### 11/19/24

#### Chi-Squared Feature Selection for miRNA Data

#### > Load Data set

In [17]:
# Load necessary libraries
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
labeled_miRNA_data = pd.read_csv('../processed_data/miRNA_stage_subtype.csv')

# Display the first few rows of the dataset
labeled_miRNA_data.head()

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b,stage,subtype
0,7314.747386,7391.483138,7334.393081,10994.201497,471.496698,318.193106,1156.241547,3272.099771,3363.611772,442.783758,...,0.0,0.0,1.847031,0,40.298863,35.429417,148.602058,12118.707689,1,2
1,9518.042994,9460.443528,9574.874468,17578.281899,785.810318,358.652676,771.986446,3871.452122,3917.224498,487.829079,...,0.0,128.562009,4.607957,0,8.60152,38.86044,111.512567,7471.802757,1,2
2,4479.97634,4387.407628,4447.955716,12394.31011,404.624244,855.241747,246.267705,1353.016896,1415.311564,416.8503,...,0.0,161.267504,1.746579,0,33.767203,31.43843,168.253822,16026.613214,1,2
3,21277.962603,21166.590502,21255.800397,15161.474118,6684.570363,503.278464,2185.922959,15012.229891,14987.262342,1107.549261,...,0.0,1.683206,10.660302,0,5.049617,95.101114,1416.978551,12750.562682,1,2
4,8002.355461,8013.396682,8033.638922,19358.942067,1276.411235,765.754731,593.005616,2630.801098,2649.43316,367.580673,...,0.0,97.990843,3.450382,0,22.77252,46.235116,455.450396,14401.203493,1,2


#### > Inspect Dataset

In [18]:
# Display dataset columns to verify structure
print("Dataset columns:", labeled_miRNA_data.columns)

# Inspect the shape of the dataset
print("Dataset shape:", labeled_miRNA_data.shape)

Dataset columns: Index(['hsa-let-7a-1', 'hsa-let-7a-2', 'hsa-let-7a-3', 'hsa-let-7b',
       'hsa-let-7c', 'hsa-let-7d', 'hsa-let-7e', 'hsa-let-7f-1',
       'hsa-let-7f-2', 'hsa-let-7g',
       ...
       'hsa-mir-943', 'hsa-mir-944', 'hsa-mir-95', 'hsa-mir-9500',
       'hsa-mir-96', 'hsa-mir-98', 'hsa-mir-99a', 'hsa-mir-99b', 'stage',
       'subtype'],
      dtype='object', length=1883)
Dataset shape: (1091, 1883)


#### > Split Features and Labels

In [20]:
# Split features (X) and target labels (y)
# Update the column name 'stage' if necessary
X = labeled_miRNA_data.iloc[:, :-1]  # All columns except the last one (features)
y = labeled_miRNA_data['stage']  # Target labels

# Display shapes of features and labels
print("Shape of features (X):", X.shape)
print("Shape of target labels (y):", y.shape)

Shape of features (X): (1091, 1882)
Shape of target labels (y): (1091,)


#### > Normalize Features

In [22]:
# Normalize the features to ensure all values are non-negative
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Confirm normalization
pd.DataFrame(X_normalized, columns=X.columns).head()

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b,stage
0,0.1532,0.15474,0.151317,0.038404,0.013304,0.008747,0.155592,0.108634,0.109582,0.160672,...,0.026466,0.0,0.0,0.012193,0.0,0.172729,0.095439,0.02104,0.025199,0.25
1,0.212347,0.210383,0.211152,0.063621,0.024217,0.010103,0.099468,0.130015,0.129169,0.183103,...,0.061469,0.0,0.093662,0.03042,0.0,0.036868,0.104682,0.014977,0.01108,0.25
2,0.077101,0.073947,0.074232,0.043767,0.010982,0.026749,0.022682,0.040175,0.040651,0.147759,...,0.025329,0.0,0.117489,0.01153,0.0,0.144733,0.084689,0.024252,0.037072,0.25
3,0.528039,0.525213,0.523103,0.054365,0.22902,0.014951,0.305986,0.527441,0.52083,0.491702,...,0.061481,0.0,0.001226,0.070375,0.0,0.021644,0.256183,0.228389,0.027118,0.25
4,0.171658,0.171466,0.169991,0.070441,0.04125,0.023749,0.073326,0.085757,0.084314,0.123224,...,0.089542,0.0,0.07139,0.022778,0.0,0.097608,0.124548,0.071202,0.032134,0.25


#### > Apply Chi-Squared Feature Selection

In [23]:
# Initialize the chi-squared feature selector
chi2_selector = SelectKBest(score_func=chi2, k=10)  # Selecting top 10 features

# Fit and transform the dataset
X_kbest = chi2_selector.fit_transform(X_normalized, y)

# Inspect the shape of the reduced feature set
print("Shape of reduced features (X_kbest):", X_kbest.shape)

Shape of reduced features (X_kbest): (1091, 10)


#### > Rank and Display Features

In [24]:
# Get scores and feature names
scores = chi2_selector.scores_
feature_names = X.columns

# Pair scores with feature names and rank them
ranked_features = sorted(zip(scores, feature_names), reverse=True)
chi2_ranked_df = pd.DataFrame(ranked_features, columns=['Score', 'Feature'])

# Display the top 10 features
print("Top 10 features based on chi-squared test:")
chi2_ranked_df.head(10)


Top 10 features based on chi-squared test:


Unnamed: 0,Score,Feature
0,,hsa-mir-103b-1
1,,hsa-mir-103b-2
2,,hsa-mir-1183
3,38.356704,hsa-mir-4663
4,5.234286,hsa-mir-1972-1
5,4.676144,hsa-mir-4776-2
6,3.948822,hsa-mir-4735
7,3.811346,hsa-mir-4666b
8,3.291953,hsa-mir-4737
9,3.207911,hsa-mir-4720


#### > Save Results for Comparison

In [26]:
# Save the ranked features to a CSV file in the processed_data folder
chi2_ranked_df.to_csv('../processed_data/chi2_ranked_features.csv', index=False)

# Display the full ranking in the notebook
chi2_ranked_df

Unnamed: 0,Score,Feature
0,,hsa-mir-103b-1
1,,hsa-mir-103b-2
2,,hsa-mir-1183
3,38.356704,hsa-mir-4663
4,5.234286,hsa-mir-1972-1
...,...,...
1877,0.137349,hsa-mir-937
1878,0.109917,hsa-mir-877
1879,0.067944,hsa-mir-92b
1880,0.055037,hsa-mir-99b
