In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the research dataset from a comma-separated file; no column is used as the index.
data_path = r'./dataset/researchDataset/DS07012.csv'
data = pd.read_csv(data_path, sep=',', index_col=False)

# Working frame without the 'Class' column; `data` itself keeps every column.
df = data.drop(columns=['Class'])
df.head()

In [None]:
# The last column of `df` is the target; every column before it is a predictor.
y = df.iloc[:, -1]
X = df.iloc[:, :-1]
X.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same
# transformation to both the training and the test predictors.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold

# Candidate regularization strengths: 30 log-spaced values
# from 10**-2.5 (about 0.0032) up to 10**2 (100).
alpha_values = np.logspace(start=-2.5, stop=2, num=30)

# Shuffled 5-fold splitter with a fixed seed so the folds are repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate every candidate alpha on the scaled training data.
lasso_cv = LassoCV(alphas=alpha_values, cv=kf).fit(X_train_scaled, y_train)

# Alpha picked by the cross-validation.
optimal_alpha = lasso_cv.alpha_
print("Optimal alpha:", optimal_alpha)

# Positions, then names, of the features whose coefficient is non-zero.
nonzero_coef_mask = lasso_cv.coef_ != 0
selected_feature_indices = np.where(nonzero_coef_mask)[0]
selected_feature_names = X.columns[selected_feature_indices]

In [None]:
from sklearn.linear_model import Lasso

# Refit a plain Lasso at the alpha chosen by LassoCV. Reading `optimal_alpha`
# (assigned in the LassoCV cell) instead of a literal pasted from an earlier
# run keeps this cell in sync if the data, the alpha grid or the CV folds change.
# NOTE(review): the previously hard-coded value 0.0031622776601683794 matches
# 10**-2.5, the smallest alpha in the search grid. An optimum on the edge of the
# grid suggests extending the grid towards smaller alphas - confirm.
lasso = Lasso(alpha=optimal_alpha)
lasso.fit(X_train_scaled, y_train)

# Features whose coefficient is non-zero after the L1 penalty.
selected_feature_indices = np.where(lasso.coef_ != 0)[0]
selected_feature_names = X.columns[selected_feature_indices]
len(selected_feature_names)

In [None]:
# Columns to export: 'Class', the Lasso-selected features, then 'Testability'.
selected_lasso_cv = list(selected_feature_names)
features_kept = ['Class', *selected_lasso_cv, 'Testability']
feature_selected_data = data.filter(items=features_kept)

# Write the reduced dataset without the row index.
new_data_path = r'./dataset/newDataset/DS_LassoCV_k45.csv'
feature_selected_data.to_csv(new_data_path, index=False)

L1-based feature selection with Lasso and Cross-Validation.
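LassoCV fits a Lasso (L1-regularized) linear model, an embedded selection method that evaluates all features jointly rather than one at a time, and uses cross-validation to tune the regularization strength alpha so that only the most important features keep non-zero coefficients.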

LassoCV is useful when you have many features and suspect that many of them are irrelevant or redundant. It automatically selects a subset of the most relevant features by setting the coefficients of less important features to zero.

In [None]:
# Standardize the full predictor matrix (all rows, no train/test split).
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
from sklearn.decomposition import PCA

# Save the column names so component loadings can be mapped back to features
column_names = X.columns

# Number of leading principal components to retain
n_components = 45
pca = PCA(n_components=n_components)

# Fit PCA on the standardized predictors and project them onto the components
X_pca = pca.fit_transform(X_scaled)

# For each component, the index of the feature with the largest absolute loading
top_feature_indices = np.abs(pca.components_).argmax(axis=1)

# One feature can have the largest loading on several components, so the raw
# lookup may repeat names. Keep each name once, in order of first appearance,
# so later cells never receive duplicate column labels. The result can
# therefore contain fewer than `n_components` features.
selected_feature_names = column_names[top_feature_indices].unique()

print(selected_feature_names)

In [None]:
# Columns to export: 'Class', the PCA-selected features, then 'Testability'.
selected_pca = list(selected_feature_names)
features_kept = ['Class', *selected_pca, 'Testability']
feature_selected_data = data.filter(items=features_kept)

# Write the reduced dataset without the row index.
new_data_path = r'./dataset/newDataset/DS_PCA_k45.csv'
feature_selected_data.to_csv(new_data_path, index=False)

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Filter out low-variance features using a variance threshold of 0.1.
var_threshold = VarianceThreshold(threshold=0.1)
var_threshold.fit(X)
X_new = var_threshold.transform(X)

# Names of the features that passed the filter; show how many remain.
feature_names = var_threshold.get_feature_names_out()
feature_names.shape

All features have significant variance: a threshold of 0.2 eliminated only 5 features, and a threshold of 0.1 eliminated only 3.

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# Score every feature against the target with f_regression and keep the 45 best.
k_best = SelectKBest(score_func=f_regression, k=45)
k_best.fit(X, y)
X_new = k_best.transform(X)

# Names of the retained features.
feature_names = k_best.get_feature_names_out()
feature_names

SelectKBest is a univariate feature selection method: it evaluates each feature independently, without considering the relationships between features.
Here f_regression is used to score the features, and the K features with the highest scores are then selected.

In [None]:
# Columns to export: 'Class', the SelectKBest features, then 'Testability'.
selected_k_best = list(feature_names)
features_kept = ['Class', *selected_k_best, 'Testability']
feature_selected_data = data.filter(items=features_kept)

# Write the reduced dataset without the row index.
new_data_path = r'./dataset/newDataset/DS_K45.csv'
feature_selected_data.to_csv(new_data_path, index=False)

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR

# Build the RFECV input explicitly from the fitted SelectKBest selector.
# `X_new` is assigned by both the VarianceThreshold cell and the SelectKBest
# cell, so relying on it makes the result depend on which cell ran last.
X_k_best = k_best.transform(X)

# RFE ranks features by the magnitude of the estimator's coefficients, which is
# only comparable across features on a common scale, and SVMs are not
# scale-invariant. Standardizing also helps the linear SVR converge faster.
# NOTE(review): the scaler is fitted on all rows before cross-validation;
# move it into a pipeline if strict fold isolation is required.
X_rfecv = StandardScaler().fit_transform(X_k_best)

estimator = SVR(kernel="linear")

# Eliminate 5 features per step, never go below 15, and score with 3-fold CV.
# n_jobs=-1 runs the folds in parallel on all available cores.
rfecv = RFECV(
    estimator=estimator,
    step=5,
    min_features_to_select=15,
    cv=3,
    n_jobs=-1,
)
rfecv.fit(X_rfecv, y)

print(f"Optimal number of features: {rfecv.n_features_}")

RFECV takes extremely long to run, even on a smaller dataset with 40 features.