In [1]:
import pandas as pd
import numpy as np

# Simulation

In [123]:
# Pin NumPy's global RNG so repeated runs draw the same random numbers
np.random.seed(42)

# Size of the simulated dataset: row count, then number of numeric columns
n_rows, n_cols = 2000, 100

In [124]:
# Build the simulated dataset with a project-local helper, passing the
# configured row count and number of numeric columns.
# NOTE(review): the helper's implementation is not visible in this notebook.
# Whether it draws from NumPy's global RNG (and therefore honours the
# np.random.seed call above) and which extra columns it emits should be
# confirmed against its source.
from generate_simulation_data import generate_simulation_data
df = generate_simulation_data(num_rows=n_rows, num_numeric_columns=n_cols)

In [125]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

# The 'Y' column is the target; every other column is used as a predictor
Y = df['Y']
X = df.drop(columns=['Y'])

# Hold out half of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    random_state=42,
)

In [126]:
from sklearn.linear_model import LogisticRegression

# Train a default-configured logistic regression on the training half
log_reg = LogisticRegression().fit(X_train, Y_train)

# Take the second predict_proba column as the score for the test rows
# (presumably the positive class of a 0/1 target -- confirm via log_reg.classes_)
Y_pred = log_reg.predict_proba(X_test)[:, 1]
# A row is labelled 1 when its score is strictly above 0.5, otherwise 0
Y_pred_binary = (Y_pred > 0.5).astype(int)

# Hard labels feed accuracy; the continuous scores feed ROC AUC
accuracy = accuracy_score(Y_test, Y_pred_binary)
auc_score_log_reg = roc_auc_score(Y_test, Y_pred)

print("Accuracy:", accuracy)
print("Logistic Regression AUC Score:", auc_score_log_reg)

Accuracy: 0.961
Logistic Regression AUC Score: 0.9937919543728673


In [134]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

# L1-penalised linear regression fitted on Y_train; alpha is the
# regularisation strength and can be tuned
lasso = Lasso(alpha=0.011).fit(X_train, Y_train)

# Lasso returns continuous regression scores rather than probabilities;
# a score strictly above 0.5 is labelled 1, anything else 0
Y_pred = lasso.predict(X_test)
Y_pred_binary = (Y_pred > 0.5).astype(int)

accuracy = accuracy_score(Y_test, Y_pred_binary)
print("Accuracy:", accuracy)

# The raw scores serve as the ranking for ROC AUC
auc_score = roc_auc_score(Y_test, Y_pred)
print("AUC Score:", auc_score)

# A feature counts as selected when its fitted coefficient is non-zero
nonzero_mask = lasso.coef_ != 0
selected_features = X.columns[nonzero_mask]
print(f"Selected {len(selected_features)} Features:", selected_features)

Accuracy: 0.975
AUC Score: 0.9983578718018553
Selected 24 Features: Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X8', 'X12', 'X15', 'X16', 'X31', 'X51',
       'X62', 'X67', 'X70', 'X77', 'X78', 'X83', 'X90', 'X92', 'X95', 'X96',
       'X72_correlated_2', 'X28_correlated_2', 'X55_correlated_1'],
      dtype='object')


In [146]:
from sklearn.linear_model import ElasticNetCV

# Elastic net with a fixed l1_ratio; 10-fold cross-validation is used
# during fitting and the seed keeps it repeatable
enet = ElasticNetCV(cv=10, random_state=42, l1_ratio=0.15)
enet.fit(X_train, Y_train)

# Continuous regression scores (not probabilities), cut at 0.5 to get labels
Y_pred = enet.predict(X_test)
Y_pred_binary = (Y_pred > 0.5).astype(int)

accuracy = accuracy_score(Y_test, Y_pred_binary)
print("Accuracy:", accuracy)

# The raw scores serve as the ranking for ROC AUC
auc_score = roc_auc_score(Y_test, Y_pred)
print("AUC Score:", auc_score)

# Features whose fitted coefficient is non-zero are treated as selected
selected_features = X.columns[enet.coef_ != 0]
print(f"Selected {len(selected_features)} Features:")
print(selected_features)

# Keep only the part of each name before the first underscore, so that
# e.g. 'X28_correlated_2' collapses to 'X28', and drop duplicates
select_features_simplified = list({name.split("_")[0] for name in selected_features})

# A base feature counts as captured when its numeric index is at most n_cols/4
successful_selections = [
    name
    for name in select_features_simplified
    if int(name.split("X")[-1]) <= int(n_cols / 4)
]
print(f"{len(successful_selections)} features successfully captured")

Accuracy: 0.976
AUC Score: 0.9983218250853106
Selected 23 Features:
Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X8', 'X12', 'X15', 'X16', 'X31', 'X51',
       'X62', 'X67', 'X70', 'X74', 'X78', 'X83', 'X90', 'X92', 'X95',
       'X72_correlated_2', 'X28_correlated_2', 'X55_correlated_1'],
      dtype='object')
9 features successfully captured


In [143]:
from sklearn.ensemble import RandomForestClassifier

# n_cols/4 is used both as the number of trees and as the number of
# top-ranked features reported below
n_quarter = int(n_cols / 4)

random_forest = RandomForestClassifier(n_estimators=n_quarter, random_state=42)
random_forest.fit(X_train, Y_train)

# Continuous scores (second predict_proba column) and hard labels for the test rows
Y_pred = random_forest.predict_proba(X_test)[:, 1]
Y_pred_binary = random_forest.predict(X_test)

# Accuracy is computed from the hard labels
accuracy_rf = accuracy_score(Y_test, Y_pred_binary)
print("Random Forest Accuracy:", accuracy_rf)

# ROC AUC is computed from the continuous scores
auc_score_rf = roc_auc_score(Y_test, Y_pred)
print("Random Forest AUC Score:", auc_score_rf)

# Rank all features from most to least important according to the fitted forest
feature_importances = random_forest.feature_importances_
sorted_indices = np.argsort(feature_importances)[::-1]

# Report the names of the n_cols/4 highest-ranked features
top_features = X.columns[sorted_indices[:n_quarter]]
print(f"Top {len(top_features)} Features Selected by Random Forest:", top_features)

Random Forest Accuracy: 0.83
Random Forest AUC Score: 0.9192673705121838
Top 25 Features Selected by Random Forest: Index(['X1', 'X4', 'X3', 'X41', 'X5', 'X20', 'X95', 'X28_correlated_2', 'X51',
       'X24', 'X40', 'X87', 'X100', 'X50', 'X28_correlated_4', 'X84', 'X27',
       'X42_correlated_1', 'X25', 'X67', 'X99', 'X10', 'X48', 'X60', 'X57'],
      dtype='object')


In [145]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score

# Linear-kernel SVM. probability=True enables predict_proba, which is needed
# for the AUC below. Those probability estimates come from an internal
# cross-validation that shuffles the data, so random_state is pinned (as for
# the other stochastic estimators in this notebook) to keep the reported AUC
# independent of how much of the global RNG earlier cells have consumed.
svm_classifier = SVC(kernel='linear', probability=True, random_state=42)

# Recursive feature elimination: drop one feature per step until n_cols/4 remain
rfe = RFE(estimator=svm_classifier, n_features_to_select=int(n_cols/4), step=1)

# Fit RFE to the training data
rfe.fit(X_train, Y_train)

# Reduce the training data to the selected features
X_train_selected = rfe.transform(X_train)

# RFE works on clones of the estimator, so svm_classifier itself is still
# unfitted at this point; train it on the reduced training matrix
svm_classifier.fit(X_train_selected, Y_train)

# Reduce the test data to the same selected features
X_test_selected = rfe.transform(X_test)

# Hard labels and continuous scores (second predict_proba column) for the test rows
Y_pred_svm = svm_classifier.predict(X_test_selected)
Y_prob_svm = svm_classifier.predict_proba(X_test_selected)[:, 1]

# Calculate accuracy
accuracy_svm = accuracy_score(Y_test, Y_pred_svm)
print("SVM with RFE Accuracy:", accuracy_svm)

# Calculate AUC score
auc_score_svm = roc_auc_score(Y_test, Y_prob_svm)
print("SVM AUC Score:", auc_score_svm)

# Map the positions retained by RFE back to column names
selected_features_indices = rfe.get_support(indices=True)
selected_features = X_train.columns[selected_features_indices]
print(f"Selected {len(selected_features)} Features:", selected_features)

# Keep only the part of each name before the first underscore (so that
# 'X28_correlated_2' collapses to 'X28'), drop duplicates, then count the
# base features whose numeric index is at most n_cols/4
select_features_simplified = list(set([f.split("_")[0] for f in list(selected_features)]))
successful_selections = [f for f in select_features_simplified if int(f.split("X")[-1]) <= int(n_cols/4)]
print(f"{len(successful_selections)} features successfully captured")

SVM with RFE Accuracy: 0.976
SVM AUC Score: 0.9982016693634951
Selected 25 Features: Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X15', 'X39', 'X69', 'X74', 'X80', 'X97',
       'X99', 'X85_correlated_1', 'X85_correlated_2', 'X13_correlated_3',
       'X13_correlated_4', 'X56_correlated_3', 'X56_correlated_4',
       'X14_correlated_1', 'X14_correlated_3', 'X28_correlated_2',
       'X28_correlated_4', 'X89_correlated_1', 'X89_correlated_3',
       'X55_correlated_1'],
      dtype='object')
8 features successfully captured


In [131]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Base learner: linear-kernel SVM; probability=True enables predict_proba,
# which is needed for the AUC computation below
svm_base = SVC(kernel='linear', probability=True)

# Bagged ensemble of n_cols/4 SVMs. The base learner is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2 and the old keyword was removed in 1.4; the positional
# form works on both sides of the rename.
ensemble_svm = BaggingClassifier(svm_base, n_estimators=int(n_cols/4), random_state=42)

# Fit the ensemble classifier to the training data
ensemble_svm.fit(X_train, Y_train)

# Hard labels and continuous scores (second predict_proba column) for the test rows
Y_pred_ensemble_svm = ensemble_svm.predict(X_test)
Y_prob_ensemble_svm = ensemble_svm.predict_proba(X_test)[:, 1]

# Calculate accuracy
accuracy_ensemble_svm = accuracy_score(Y_test, Y_pred_ensemble_svm)
print("Ensemble SVM Accuracy:", accuracy_ensemble_svm)

# Calculate AUC score
auc_score_ensemble_svm = roc_auc_score(Y_test, Y_prob_ensemble_svm)
print("Ensemble SVM AUC Score:", auc_score_ensemble_svm)

# Bagging performs no feature selection by itself: estimators_features_ holds
# one index array per fitted estimator, so its length is the number of
# estimators, not a number of selected features. Rank the features instead by
# the mean absolute linear-SVM coefficient across the ensemble members and
# keep the top n_cols/4.
# Assumes a binary target, so each member's coef_ has a single row.
coef_magnitude = np.zeros(X_train.shape[1])
for member, feature_idx in zip(ensemble_svm.estimators_, ensemble_svm.estimators_features_):
    # np.add.at accumulates correctly even if an index were repeated
    np.add.at(coef_magnitude, feature_idx, np.abs(member.coef_).ravel())
coef_magnitude /= len(ensemble_svm.estimators_)

top_indices = np.argsort(coef_magnitude)[::-1][:int(n_cols/4)]
selected_features = X_train.columns[top_indices]
print(f"Selected {len(selected_features)} Features by Ensemble SVM:", selected_features)

Ensemble SVM Accuracy: 0.958
Ensemble SVM AUC Score: 0.9934875598776014
Selected 25 Features by Ensemble SVM


# Real data

In [None]:
# Read the input files for the real-data section; the relative paths are
# resolved against the current working directory.
# NOTE: `df` is rebound here, replacing the simulated frame built earlier in
# this notebook.
df = pd.read_csv('merged_baseline_data.csv')
# Tab-separated text file; presumably holds the outcome measures -- TODO confirm
outcome = pd.read_csv('abcd_cbcls01.txt', sep="\t")
# Presumably a codebook describing the variables -- TODO confirm contents
cb = pd.read_csv('codebook.csv')
# Preview the first rows of df
df.head()