In [9]:
import nbimporter
from format_data import FormatMLData
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
# Build the ML dataset object from the subset TSV. The tuning cells below read
# their features and labels from `ml_data.training_data`.
ml_data = (
    FormatMLData("../data/ml_data_subset.tsv")
    .return_ml_data()
)

## Hyperparameter Tuning for Logistic Regression Model

In [5]:
# Hyperparameter tuning for logistic regression on the subset data.
# Note: final_selected_features_logistic_regression differs from the feature list
# used on the full dataset - these features were selected from the subset data.
final_selected_features_logistic_regression = [
    'EXOMISER_GENE_PHENO_SCORE',
    'EXOMISER_GENE_VARIANT_SCORE',
    'WEIGHTED_PS1',
    'WEIGHTED_PS2',
    'WEIGHTED_PP5',
    'WEIGHTED_PM5',
    'WEIGHTED_PM4',
    'WEIGHTED_PM2',
]

X = ml_data.training_data.select(final_selected_features_logistic_regression)
y = ml_data.training_data.select(["CAUSATIVE_VARIANT_STATUS"]).to_series().to_numpy().ravel()

# Two separate grids are required: `l1_ratio` is only meaningful for the
# elastic-net penalty, and elastic-net is only searched with the 'saga' solver.
param_grid_l1_l2 = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 500, 1000],
}

param_grid_elasticnet = {
    'penalty': ['elasticnet'],
    'C': [0.001, 0.01, 0.1, 1, 10],
    'solver': ['saga'],
    'l1_ratio': [0.1, 0.5, 0.9],
    'max_iter': [100, 200, 500, 1000],
}

# GridSearchCV accepts a list of grids and explores each one independently.
combined_param_grid = [param_grid_l1_l2, param_grid_elasticnet]

lr = LogisticRegression(random_state=42)

# 10-fold cross-validated grid search, ranked by ROC AUC.
# error_score='raise' surfaces any failing parameter combination immediately
# instead of silently recording a NaN score for it.
lr_grid_search = GridSearchCV(
    estimator=lr,
    param_grid=combined_param_grid,
    cv=10,
    scoring='roc_auc',
    error_score='raise',
    return_train_score=True,
)
lr_grid_search.fit(X, y)

print("Best parameters for Logistic Regression:", lr_grid_search.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric (ROC AUC),
# not accuracy, so label it accordingly.
print("Best cross-validated ROC AUC:", lr_grid_search.best_score_)

# Persist per-candidate train and test scores so they can be compared for
# signs of overfitting.
results_df = pd.DataFrame(lr_grid_search.cv_results_)
results_df.to_csv("logistic_regression_grid_search.tsv", sep='\t')



Best parameters for Logistic Regression: {'C': 10, 'l1_ratio': 0.9, 'max_iter': 100, 'penalty': 'elasticnet', 'solver': 'saga'}
Best cross-validated accuracy: 0.9989833584545819


  _data = np.array(data, dtype=dtype, copy=copy,


## Hyperparameter Tuning for Random Forest Classifier Model


In [7]:
# Hyperparameter tuning for the random forest classifier on the subset data.
# Note: final_selected_features_random_forest differs from the feature list
# used on the full dataset - these features were selected from the subset data.
final_selected_features_random_forest = [
    'EXOMISER_GENE_PHENO_SCORE',
    'EXOMISER_GENE_VARIANT_SCORE',
    'WEIGHTED_PS1',
    'WEIGHTED_PS2',
    'WEIGHTED_PM2',
    'WEIGHTED_BS4',
    'WEIGHTED_BP6',
    'WEIGHTED_BP4',
]

X = ml_data.training_data.select(final_selected_features_random_forest)
y = ml_data.training_data.select(["CAUSATIVE_VARIANT_STATUS"]).to_series().to_numpy().ravel()

# Parameter grid explored exhaustively by the search below.
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'class_weight': [None, 'balanced'],
}

rf = RandomForestClassifier(random_state=42)

# 10-fold cross-validated grid search, ranked by ROC AUC.
# error_score='raise' surfaces any failing parameter combination immediately
# instead of silently recording a NaN score for it.
rf_grid_search = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    cv=10,
    scoring='roc_auc',
    error_score='raise',
    return_train_score=True,
)

rf_grid_search.fit(X, y)

print("Best parameters for Random Forest:", rf_grid_search.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric (ROC AUC),
# not accuracy, so label it accordingly.
print("Best cross-validated ROC AUC:", rf_grid_search.best_score_)

# Persist per-candidate train and test scores so they can be compared for
# signs of overfitting.
results_df = pd.DataFrame(rf_grid_search.cv_results_)
results_df.to_csv("random_forest_grid_search.tsv", sep='\t')

Best parameters for Random Forest: {'class_weight': None, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validated accuracy: 0.9986895936509889


## Hyperparameter Tuning for XGBoost Model


In [10]:
# Hyperparameter tuning for the XGBoost classifier on the subset data.
# Note: final_selected_features_xgboost differs from the feature list used on
# the full dataset - these features were selected from the subset data.
final_selected_features_xgboost = [
    'EXOMISER_GENE_PHENO_SCORE',
    'EXOMISER_GENE_VARIANT_SCORE',
    'WEIGHTED_PVS1',
    'WEIGHTED_PS1',
    'WEIGHTED_PS2',
    'WEIGHTED_PP5',
    'WEIGHTED_PP3',
    'WEIGHTED_PM2',
]

X = ml_data.training_data.select(final_selected_features_xgboost)
y = ml_data.training_data.select(["CAUSATIVE_VARIANT_STATUS"]).to_series().to_numpy().ravel()

# Parameter grid explored exhaustively by the search below.
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [1],
}

# `random_state` is the scikit-learn wrapper's seed parameter; it keeps the
# seeding consistent with the other estimators tuned in this notebook.
xgb = XGBClassifier(random_state=42)

# NOTE: a stray copy of the Random Forest GridSearchCV construction used to sit
# here. It replaced the already-fitted `rf_grid_search` with an unfitted object
# and made this cell depend on `rf` / `rf_param_grid` from the previous cell,
# so it has been removed.

# 10-fold cross-validated grid search, ranked by ROC AUC.
# error_score='raise' matches the other searches in this notebook: a failing
# parameter combination aborts the run instead of being recorded as NaN.
xgb_grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_param_grid,
    cv=10,
    scoring='roc_auc',
    error_score='raise',
    return_train_score=True,
)

xgb_grid_search.fit(X, y)

print("Best parameters for XGBoost:", xgb_grid_search.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric (ROC AUC),
# not accuracy, so label it accordingly.
print("Best cross-validated ROC AUC:", xgb_grid_search.best_score_)

# Persist per-candidate train and test scores so they can be compared for
# signs of overfitting.
results_df = pd.DataFrame(xgb_grid_search.cv_results_)
results_df.to_csv("xgboost_grid_search.tsv", sep='\t')

Best parameters for XGBoost Forest: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 100, 'scale_pos_weight': 1, 'subsample': 0.8}
Best cross-validated accuracy: 0.9986572949029501
