In [22]:
import pandas as pd
import numpy as np

In [23]:
# Load the reference dataset and list its predictor columns, i.e. every
# column except the 'Class' label and the 'Testability' column.
data_path = r'./dataset/researchDataset/DS07610.csv'
df = pd.read_csv(data_path, delimiter=',', index_col=False)
df = df.drop(['Class', 'Testability'], axis=1)
top_15_predictors = df.columns.tolist()
top_15_predictors

['CSORD_AvgLineCodeExe',
 'CSLEX_NumberOfConditionalJumpStatements',
 'CSORD_AvgLineCode',
 'CSORD_NumberOfDepends',
 'CSLEX_NumberOfUniqueIdentifiers',
 'CSLEX_NumberOfDots',
 'CSORD_CountDeclInstanceMethod',
 'CSORD_CountDeclMethodPublic',
 'CSORD_NIM',
 'CSORD_AvgStmtDecl',
 'CSORD_CountDeclClassMethod',
 'CSLEX_NumberOfNewStatements',
 'CSLEX_NumberOfReturnAndPrintStatements',
 'CSORD_NumberOfClassConstructors',
 'PK_CountDeclClassMethod']

In [24]:
# Load the full dataset. `data` keeps every column (later cells filter it
# when saving reduced datasets); `df` is the same table without 'Class'.
data_path = r'./dataset/researchDataset/DS07012.csv'
data = pd.read_csv(data_path, index_col=False, delimiter=',')
df = data.drop(columns=['Class'])
df.head()

Unnamed: 0,PK_CountLineCode,PK_CountLineCodeDecl,PK_CountLineCodeExe,PK_AvgLineCode,PK_AvgLineCodeDecl,PK_AvgLineCodeExe,PK_MaxLineCode,PK_MaxLineCodeDecl,PK_MaxLineCodeExe,PK_MinLineCode,...,CSORD_SumKnots,CSORD_MinKnots,CSORD_MaxKnots,CSORD_AvgKnots,CSORD_SDKnots,CSORD_NumberOfClassConstructors,CSORD_NumberOfDepends,CSORD_NumberOfDependsBy,CSORD_NumberOfMethods,Testability
0,0,0,0,0,0.0,0.0,0,0,0,0,...,0,0,0,0.0,0.0,0,5,1,1,0.34956
1,0,0,0,0,0.0,0.0,0,0,0,0,...,0,0,0,0.0,0.0,0,3,1,1,0.312267
2,0,0,0,0,0.0,0.0,0,0,0,0,...,1,0,1,0.090909,0.28748,2,0,6,11,0.790031
3,0,0,0,0,0.0,0.0,0,0,0,0,...,0,0,0,0.0,0.0,3,0,4,8,0.761905
4,0,0,0,0,0.0,0.0,0,0,0,0,...,0,0,0,0.0,0.0,2,1,10,8,0.61184


In [25]:
# The last column of `df` is the target; every column before it is a feature.
X, y = df.iloc[:, :-1], df.iloc[:, -1]
X.shape

(16165, 253)

In [26]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

# Pipeline: univariate scoring with f_regression, then a linear regression
# fitted on the k best-scoring features.
feature_selector = SelectKBest(score_func=f_regression)
regressor = LinearRegression()
pipeline = make_pipeline(feature_selector, regressor)

# Candidate values of k: 5, 10, ..., 100. make_pipeline names each step after
# its lower-cased class name, hence the 'selectkbest__k' parameter key.
param_grid = {'selectkbest__k': range(5, 101, 5)}

# 5-fold cross-validated grid search; scored by negative mean squared error,
# so the best k is the one with the lowest cross-validated MSE.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

# Best k and the pipeline refitted on all of X with that k
optimal_k = grid_search.best_params_['selectkbest__k']
best_pipeline = grid_search.best_estimator_

# Reduce X to the selected columns
best_selector = best_pipeline.named_steps['selectkbest']
X_new = best_selector.transform(X)

# Rebuild a DataFrame that carries the ORIGINAL column names of the selected
# features (instead of opaque 'Feature_<n>' labels) and X's own index, so the
# exported file is interpretable and the target assignment below aligns row
# for row.
selected_feature_indices = best_selector.get_support(indices=True)
selected_features_df = pd.DataFrame(
    X_new,
    columns=X.columns[selected_feature_indices],
    index=X.index,
)

# Append the target variable (y) and save to a CSV file
selected_features_df['Target'] = y
selected_features_df.to_csv('selected_features_dataset.csv', index=False)

In [27]:
# Names of the features kept by the tuned SelectKBest step
kbest_step = best_pipeline.named_steps['selectkbest']
feature_names = kbest_step.get_feature_names_out()
selected_k_best = list(feature_names)

# Keep the 'Class' label, the selected predictors and the 'Testability'
# column, then write the reduced dataset to disk.
features_kept = ['Class', *selected_k_best, 'Testability']
feature_selected_data = data.filter(items=features_kept)

new_data_path = r'./dataset/newDataset/DS_KBest_CV.csv'
feature_selected_data.to_csv(new_data_path, index=False)

In [28]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

# Pipeline: recursive feature elimination driven by a linear regression,
# followed by a linear regression fitted on the surviving features.
estimator = LinearRegression()
feature_selector = RFE(estimator=estimator)
pipeline = make_pipeline(feature_selector, estimator)

# Candidate feature counts: 5, 10, ..., 100. make_pipeline names each step
# after its lower-cased class name, hence 'rfe__n_features_to_select'.
param_grid = {'rfe__n_features_to_select': range(5, 101, 5)}

# 5-fold cross-validated grid search; scored by negative mean squared error,
# so the best setting is the one with the lowest cross-validated MSE.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

# Best number of features and the pipeline refitted on all of X with it
optimal_n_features = grid_search.best_params_['rfe__n_features_to_select']
best_pipeline = grid_search.best_estimator_

# Reduce X to the selected columns
rfe_step = best_pipeline.named_steps['rfe']
X_new = rfe_step.transform(X)

# Label the columns with the ORIGINAL names of the features RFE kept.
# Numbering them Feature_1..Feature_k would wrongly suggest that the first k
# columns of X were selected.
selected_features_df = pd.DataFrame(
    X_new,
    columns=X.columns[rfe_step.get_support()],
    index=X.index,
)

In [29]:
# Names of the features kept by the tuned RFE step
rfe_step = best_pipeline.named_steps['rfe']
feature_names = rfe_step.get_feature_names_out()
selected_k_best = list(feature_names)

# Keep the 'Class' label, the selected predictors and the 'Testability'
# column, then write the reduced dataset to disk.
features_kept = ['Class', *selected_k_best, 'Testability']
feature_selected_data = data.filter(items=features_kept)

new_data_path = r'./dataset/newDataset/DS_RFE_CV.csv'
feature_selected_data.to_csv(new_data_path, index=False)

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [17]:
from sklearn.preprocessing import StandardScaler

# Learn the standardization statistics from the training rows only, then
# apply the same transformation to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X.shape

(16165, 253)

L1 regularization (Lasso) with Cross-validation

In [18]:
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold

# Candidate regularization strengths: 30 log-spaced values running from
# 10**-2.5 (about 0.0032) up to 10**2 (100).
alpha_values = np.logspace(-2.5, 2, num=30)

# Shuffled 5-fold splitter with a fixed seed, so the folds are repeatable
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate every candidate alpha on the scaled training data
lasso_cv = LassoCV(alphas=alpha_values, cv=kf)
lasso_cv.fit(X_train_scaled, y_train)

# Alpha chosen by the cross-validation
optimal_alpha = lasso_cv.alpha_
print("Optimal alpha:", optimal_alpha)

# Positions, then names, of the features with a non-zero coefficient
selected_feature_indices = np.flatnonzero(lasso_cv.coef_)
selected_feature_names = X.columns[selected_feature_indices]

Optimal alpha: 0.0031622776601683794


In [19]:
from sklearn.linear_model import Lasso

# Refit a plain Lasso at the alpha chosen by LassoCV and keep the features
# whose coefficient is non-zero.
lasso = Lasso(alpha=optimal_alpha)
lasso.fit(X_train_scaled, y_train)

selected_feature_indices = np.flatnonzero(lasso.coef_)
selected_feature_names = X.columns[selected_feature_indices]
len(selected_feature_names)

45

In [20]:
# Reference predictors that also appear in the current selection, listed in
# the order of the reference list.
common_predictors = [
    name
    for name in top_15_predictors
    if name in selected_feature_names
]
common_predictors

['CSORD_NumberOfDepends',
 'CSLEX_NumberOfUniqueIdentifiers',
 'CSLEX_NumberOfDots',
 'CSORD_CountDeclInstanceMethod',
 'CSORD_NIM',
 'CSORD_CountDeclClassMethod',
 'CSLEX_NumberOfNewStatements',
 'CSORD_NumberOfClassConstructors',
 'PK_CountDeclClassMethod']

LassoCV with an optimized alpha of ~0.003 produced a subset of 45 selected features. 9 of the top 15 predictors are in this subset.

In [16]:
# Keep the 'Class' label, the Lasso-selected predictors and the 'Testability'
# column, then write the reduced dataset to disk.
selected_lasso_cv = list(selected_feature_names)
features_kept = ['Class', *selected_lasso_cv, 'Testability']
feature_selected_data = data.filter(items=features_kept)

new_data_path = r'./dataset/newDataset/DS_LassoCV_k45.csv'
feature_selected_data.to_csv(new_data_path, index=False)

L1-based feature selection with Lasso and Cross-Validation.
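LassoCV runs Lasso (L1) regularization, an embedded selection method that fits all features jointly (unlike univariate methods such as SelectKBest), and uses cross-validation to tune the regularization strength alpha; the features left with non-zero coefficients are the selected ones.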

LassoCV is useful when you have many features and suspect that many of them are irrelevant or redundant. It automatically selects a subset of the most relevant features by shrinking the coefficients of less important features to zero.

In [23]:
# Standardize the complete feature matrix (all rows, no train/test split).
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

Recursive Feature Elimination (RFE) with linear SVR

In [38]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

# Base model for RFE: a linear-kernel SVR, which exposes the coefficients
# RFE ranks features by.
model = SVR(kernel='linear')

# Number of top features to keep. This variable is the single source of truth
# and is passed to RFE below (previously 45 was hard-coded a second time in
# the RFE call, so editing this variable had no effect).
n_features_to_select = 45
rfe = RFE(model, n_features_to_select=n_features_to_select)

# Fit RFE on the scaled training data
rfe.fit(X_train_scaled, y_train)

# Boolean mask over the columns: True for selected features, False otherwise
selected_feature_mask = rfe.support_

# Names of the selected features
selected_feature_names = X.columns[selected_feature_mask]

In [27]:
# Keep the 'Class' label, the RFE-selected predictors and the 'Testability'
# column, then write the reduced dataset to disk.
selected_rfe = list(selected_feature_names)
features_kept = ['Class', *selected_rfe, 'Testability']
feature_selected_data = data.filter(items=features_kept)

new_data_path = r'./dataset/newDataset/DS_RFE_k45.csv'
feature_selected_data.to_csv(new_data_path, index=False)

Principal Component Analysis (PCA): keep the highest-loading feature of each component

In [18]:
from sklearn.decomposition import PCA

# Save the column names for later use
column_names = X.columns

# Number of principal components to retain
n_components = 45
pca = PCA(n_components=n_components)

# Fit PCA on the standardized features and project the data
X_pca = pca.fit_transform(X_scaled)

# For each component, the index of the feature with the largest absolute
# loading (one index per component)
top_feature_indices = np.abs(pca.components_).argmax(axis=1)

# Several components can share the same top-loading feature, so the raw
# argmax result may contain repeats. De-duplicate while keeping the component
# order (pd.unique preserves order of first appearance) so that no feature
# name is listed twice; as a result fewer than n_components names may remain.
unique_feature_indices = pd.unique(top_feature_indices)

# Names of the selected features
selected_feature_names = column_names[unique_feature_indices]

print(X_pca.shape)

(16165, 45)


In [None]:
# Keep the 'Class' label, the PCA-derived predictors and the 'Testability'
# column, then write the reduced dataset to disk.
selected_pca = list(selected_feature_names)
features_kept = ['Class', *selected_pca, 'Testability']
feature_selected_data = data.filter(items=features_kept)

new_data_path = r'./dataset/newDataset/DS_PCA_k45.csv'
feature_selected_data.to_csv(new_data_path, index=False)

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Low-variance filter with a threshold of 0.1, fitted on the unscaled features
var_threshold = VarianceThreshold(threshold=0.1).fit(X)

X_new = var_threshold.transform(X)
feature_names = var_threshold.get_feature_names_out()
feature_names.shape

Nearly all features have substantial variance: a threshold of 0.2 eliminated only 5 features, and a threshold of 0.1 eliminated only 3.

In [29]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# Score every feature against the target with f_regression and keep the 45
# highest-scoring ones.
k_best = SelectKBest(score_func=f_regression, k=45).fit(X, y)

X_new = k_best.transform(X)
feature_names = k_best.get_feature_names_out()
select_k_best_predictors = list(feature_names)

# Reference predictors that are also among the selected features
common_predictors = [
    name
    for name in top_15_predictors
    if name in select_k_best_predictors
]
common_predictors

['CSORD_AvgLineCodeExe', 'CSORD_AvgLineCode', 'CSORD_AvgStmtDecl']

SelectKBest is a univariate feature selection method: it evaluates each feature independently, without considering the relationships between features.
Here f_regression is used to score the features, and the K features with the highest scores are selected.

SelectKBest with f_regression and k=45 recovered only 3 of the top 15 predictors reported in the paper.

In [None]:
# Keep the 'Class' label, the SelectKBest predictors and the 'Testability'
# column, then write the reduced dataset to disk.
selected_k_best = list(feature_names)
features_kept = ['Class', *selected_k_best, 'Testability']
feature_selected_data = data.filter(items=features_kept)

new_data_path = r'./dataset/newDataset/DS_K45.csv'
feature_selected_data.to_csv(new_data_path, index=False)

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR

from sklearn.preprocessing import StandardScaler

# NOTE(review): X_new is whatever the most recently executed selection cell
# produced (presumably the SelectKBest matrix) - confirm before relying on it.
#
# SVR is not scale invariant, and fitting a linear-kernel SVR on unscaled
# features makes the solver converge very slowly. Standardize first.
# The scaler is fitted on all rows before cross-validation, which leaks a
# little information across folds; acceptable for ranking features, but do not
# reuse this setup for performance estimates.
X_new_scaled = StandardScaler().fit_transform(X_new)

estimator = SVR(kernel="linear")

# Eliminate 5 features per step, never go below 15 features, score with
# 3-fold cross-validation, and fit the folds in parallel on all cores.
rfecv = RFECV(
    estimator=estimator,
    step=5,
    min_features_to_select=15,
    cv=3,
    n_jobs=-1,
)
rfecv.fit(X_new_scaled, y)

print(f"Optimal number of features: {rfecv.n_features_}")

RFECV takes an extremely long time to run, even on the smaller dataset with 40 features.