In [None]:
import nbimporter
from format_data import FormatMLData
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [None]:
# Build the ML data object from the subset TSV using the project's FormatMLData helper.
ml_data = FormatMLData(
    "../data/ml_data_subset.tsv"
).return_ml_data()

## Hyperparameter Tuning for Logistic Regression Model

In [None]:
# Hyperparameter tuning on the subset.
# NOTE: this feature list was selected on the subset data, so it differs from
# the list selected on the full dataset.
final_selected_features_logistic_regression = [
    'EXOMISER_GENE_PHENO_SCORE',
    'EXOMISER_GENE_VARIANT_SCORE',
    'WEIGHTED_PS1',
    'WEIGHTED_PS2',
    'WEIGHTED_PP5',
    'WEIGHTED_PM5',
    'WEIGHTED_PM4',
    'WEIGHTED_PM2',
]

# Feature matrix: only the selected columns of the training data.
X = ml_data.training_data.select(final_selected_features_logistic_regression)

# Target vector: the causative-variant status column, flattened to a 1-D NumPy array.
target_column = ml_data.training_data.select(["CAUSATIVE_VARIANT_STATUS"])
y = target_column.to_series().to_numpy().ravel()

# Search spaces for the grid search. Two separate grids are used because
# 'l1_ratio' is only swept together with the 'elasticnet' penalty and 'saga' solver.
param_grid_l1_l2 = dict(
    penalty=['l1', 'l2'],
    C=[0.001, 0.01, 0.1, 1, 10],
    solver=['liblinear', 'saga'],
    max_iter=[100, 200, 500, 1000],
)
param_grid_elasticnet = dict(
    penalty=['elasticnet'],
    C=[0.001, 0.01, 0.1, 1, 10],
    solver=['saga'],
    l1_ratio=[0.1, 0.5, 0.9],
    max_iter=[100, 200, 500, 1000],
)

# Both grids are handed to the search together as a list.
combined_param_grid = [param_grid_l1_l2, param_grid_elasticnet]

# Base estimator; the random seed is fixed at 42.
lr = LogisticRegression(random_state=42)

# Grid search over both parameter grids, scored by ROC AUC with cv=10.
lr_grid_search = GridSearchCV(
    lr,
    combined_param_grid,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,                # run on all available processors
    error_score='raise',      # raise instead of recording a failed fit as a score
    return_train_score=True,  # keep train scores alongside the test scores
)
lr_grid_search.fit(X, y)

# Report the winning parameter combination and its mean cross-validated score.
# The search is scored with ROC AUC, not accuracy, so the metric name in the label
# is read from the search object's own `scoring` setting to keep the two in sync.
print("Best parameters for Logistic Regression:", lr_grid_search.best_params_)
print(f"Best cross-validated score ({lr_grid_search.scoring}):", lr_grid_search.best_score_)

# Persist the full cross-validation results (train and test scores included)
# as a TSV so they can be inspected for signs of overfitting.
results_df = pd.DataFrame(data=lr_grid_search.cv_results_)
results_df.to_csv(path_or_buf="logistic_regression_grid_search.tsv", sep="\t")