Group 33, Florida Atlantic University Lasso Feature Selection for miRNA Data

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [2]:
# Load Dataset
labeled_miRNA_data = pd.read_csv('../processed_data/miRNA_stage_subtype.csv')
assert not labeled_miRNA_data.isna().any().any(), "NaN values detected in input data"

print("Dataset shape:", labeled_miRNA_data.shape)

Dataset shape: (1091, 1883)


In [3]:
# Separate Features and Target
X = labeled_miRNA_data.drop(columns=['stage', 'subtype'])  # Drop 'stage' and 'subtype'
y = labeled_miRNA_data['stage']                           # Use 'stage' as the target variable

print("Shape of features (X):", X.shape)
print("Shape of target labels (y):", y.shape)


Shape of features (X): (1091, 1881)
Shape of target labels (y): (1091,)


In [4]:
# Normalize Features
scaler = StandardScaler()
X_normalized = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns
)
assert not X_normalized.isna().any().any(), "NaN values detected after normalization"
print("Features normalized successfully.")

Features normalized successfully.


In [5]:
# Tune Alpha Using Grid Search
param_grid = {'alpha': np.logspace(-6, -2, 50)}
lasso = Lasso(random_state=42, max_iter=10000)
grid_search = GridSearchCV(
    lasso, param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1
)
grid_search.fit(X_normalized, y)

In [6]:
# Get the Best Alpha
best_alpha = grid_search.best_params_['alpha']
print(f"Best alpha found: {best_alpha}")

Best alpha found: 0.01


In [7]:
# Fit Lasso with the Best Alpha
lasso = Lasso(alpha=best_alpha, random_state=42, max_iter=10000)
lasso.fit(X_normalized, y)

In [8]:
# Extract Important Features
lasso_coefficients = np.abs(lasso.coef_)
lasso_selected_features = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lasso.coef_
})
lasso_selected_features = lasso_selected_features[lasso_selected_features['Coefficient'] != 0]

print("Lasso Selected Features:")
print(lasso_selected_features.sort_values(by='Coefficient', ascending=False).head(10))


Lasso Selected Features:
            Feature  Coefficient
227    hsa-mir-1825     0.083514
30     hsa-mir-1181     0.079568
1806   hsa-mir-8066     0.069419
233     hsa-mir-187     0.069182
231     hsa-mir-185     0.067875
1235  hsa-mir-548ak     0.061115
1584   hsa-mir-6766     0.060375
1824   hsa-mir-8082     0.060024
1754    hsa-mir-760     0.059885
636    hsa-mir-3922     0.058777


In [9]:
# Save Lasso Results
lasso_selected_features.to_csv('../results/lasso_results.csv', index=False)
print("Lasso results saved to '../results/lasso_results.csv'.")

Lasso results saved to '../results/lasso_results.csv'.
