# Cross Validation

This notebook contains the code for running cross-validation on the training data with the optimal set of parameters and features. The final section, which applies the trained models to the validation data, cannot be run because we are not able to share that data; it is included only to illustrate the workflow.

In [3]:
import nbimporter
from format_data import FormatMLData
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Silence scikit-learn's ConvergenceWarning for the rest of the session so the
# cell outputs below are not flooded with solver messages.
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

## Logistic Regression Cross Validation Analysis

In [5]:
# Read the training subset and pin the feature columns and tuned
# hyper-parameters used for the logistic regression model.
ml_data = FormatMLData("../data/ml_data_subset.tsv").return_ml_data()
final_selected_features_logistic_regression = [
    'EXOMISER_GENE_PHENO_SCORE',
    'EXOMISER_GENE_VARIANT_SCORE',
    'WEIGHTED_PS1',
    'WEIGHTED_PS2',
    'WEIGHTED_PP5',
    'WEIGHTED_PM5',
    'WEIGHTED_PM4',
    'WEIGHTED_PM2',
]
best_params_lr = {
    'C': 10,
    'l1_ratio': 0.9,
    'max_iter': 100,
    'penalty': 'elasticnet',
    'solver': 'saga',
}

# Feature matrix (one column per selected feature) and flattened label vector.
feature_table = ml_data.training_data.select(final_selected_features_logistic_regression)
X = feature_table.to_pandas().values
label_table = ml_data.training_data.select(["CAUSATIVE_VARIANT_STATUS"])
y = label_table.to_pandas().values.ravel()

# 10 stratified folds repeated 10 times; every split contributes one score to
# each list in `metrics` and one fitted estimator to `models`.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
metric_names = ("auc", "precision", "recall", "f1")
metrics = {name: [] for name in metric_names}
models = []

# Fit one model per split, seeded with the 1-based split number, then score it
# on the rows held out by that split.
for fold, (fit_rows, held_out_rows) in enumerate(cv.split(X, y), start=1):
    model = LogisticRegression(**best_params_lr, random_state=fold)
    model.fit(X[fit_rows], y[fit_rows])
    models.append(model)

    X_test = X[held_out_rows]
    y_test = y[held_out_rows]
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # second predict_proba column
    y_pred = model.predict(X_test)

    # Scores are listed in the same order as `metric_names`.
    fold_scores = (
        roc_auc_score(y_test, y_pred_proba),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    )
    for name, score in zip(metric_names, fold_scores):
        metrics[name].append(score)

# Summarise every metric as (mean, standard deviation) over all splits.
results = {}
for metric, scores in metrics.items():
    results[metric] = (np.mean(scores), np.std(scores))

# Report the summary, one metric per line.
print("\nEvaluation Metrics (Mean ± Std):")
for metric in results:
    mean, std = results[metric]
    print(f"{metric.capitalize()}: {mean:.4f} ± {std:.4f}")


Evaluation Metrics (Mean ± Std):
Auc: 0.9988 ± 0.0016
Precision: 0.9921 ± 0.0111
Recall: 0.9807 ± 0.0171
F1: 0.9863 ± 0.0102


## Random Forest Cross Validation Analysis


In [9]:
# Read the training subset and pin the feature columns and tuned
# hyper-parameters used for the random forest model.
ml_data = FormatMLData("../data/ml_data_subset.tsv").return_ml_data()
final_selected_features_random_forest = [
    'EXOMISER_GENE_PHENO_SCORE',
    'EXOMISER_GENE_VARIANT_SCORE',
    'WEIGHTED_PS1',
    'WEIGHTED_PS2',
    'WEIGHTED_PP5',
    'WEIGHTED_PM5',
    'WEIGHTED_PM4',
    'WEIGHTED_PM2',
]

best_params_rf = {
    'class_weight': None,
    'max_depth': 10,
    'max_features': 'sqrt',
    'min_samples_leaf': 2,
    'min_samples_split': 5,
    'n_estimators': 200,
}

# Feature matrix (one column per selected feature) and flattened label vector.
feature_table = ml_data.training_data.select(final_selected_features_random_forest)
X = feature_table.to_pandas().values
label_table = ml_data.training_data.select(["CAUSATIVE_VARIANT_STATUS"])
y = label_table.to_pandas().values.ravel()

# 10 stratified folds repeated 10 times; every split contributes one score to
# each list in `metrics` and one fitted estimator to `models`.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
metric_names = ("auc", "precision", "recall", "f1")
metrics = {name: [] for name in metric_names}
models = []

# Fit one forest per split, seeded with the 1-based split number, then score
# it on the rows held out by that split.
for fold, (fit_rows, held_out_rows) in enumerate(cv.split(X, y), start=1):
    model = RandomForestClassifier(**best_params_rf, random_state=fold)
    model.fit(X[fit_rows], y[fit_rows])
    models.append(model)

    X_test = X[held_out_rows]
    y_test = y[held_out_rows]
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # second predict_proba column
    y_pred = model.predict(X_test)

    # Scores are listed in the same order as `metric_names`.
    fold_scores = (
        roc_auc_score(y_test, y_pred_proba),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    )
    for name, score in zip(metric_names, fold_scores):
        metrics[name].append(score)

# Summarise every metric as (mean, standard deviation) over all splits.
results = {}
for metric, scores in metrics.items():
    results[metric] = (np.mean(scores), np.std(scores))

# Report the summary, one metric per line.
print("\nEvaluation Metrics (Mean ± Std):")
for metric in results:
    mean, std = results[metric]
    print(f"{metric.capitalize()}: {mean:.4f} ± {std:.4f}")


Evaluation Metrics (Mean ± Std):
Auc: 0.9989 ± 0.0015
Precision: 0.9903 ± 0.0121
Recall: 0.9847 ± 0.0164
F1: 0.9874 ± 0.0106


In [10]:

# Read the training subset and pin the feature columns and tuned
# hyper-parameters used for the XGBoost model.
ml_data = FormatMLData("../data/ml_data_subset.tsv").return_ml_data()
final_selected_features_xgboost = [
    'EXOMISER_GENE_PHENO_SCORE',
    'EXOMISER_GENE_VARIANT_SCORE',
    'WEIGHTED_PS1',
    'WEIGHTED_PS2',
    'WEIGHTED_PP5',
    'WEIGHTED_PM5',
    'WEIGHTED_PM4',
    'WEIGHTED_PM2',
]

best_params_xgboost = {
    'colsample_bytree': 0.8,
    'gamma': 0,
    'learning_rate': 0.01,
    'max_depth': 5,
    'min_child_weight': 1,
    'n_estimators': 100,
    'scale_pos_weight': 1,
    'subsample': 0.8,
}

# Feature matrix (one column per selected feature) and flattened label vector.
feature_table = ml_data.training_data.select(final_selected_features_xgboost)
X = feature_table.to_pandas().values
label_table = ml_data.training_data.select(["CAUSATIVE_VARIANT_STATUS"])
y = label_table.to_pandas().values.ravel()

# 10 stratified folds repeated 10 times; every split contributes one score to
# each list in `metrics` and one fitted estimator to `models`.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
metric_names = ("auc", "precision", "recall", "f1")
metrics = {name: [] for name in metric_names}
models = []

# Fit one booster per split, seeded with the 1-based split number, then score
# it on the rows held out by that split.
for fold, (fit_rows, held_out_rows) in enumerate(cv.split(X, y), start=1):
    model = XGBClassifier(**best_params_xgboost, random_state=fold)
    model.fit(X[fit_rows], y[fit_rows])
    models.append(model)

    X_test = X[held_out_rows]
    y_test = y[held_out_rows]
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # second predict_proba column
    y_pred = model.predict(X_test)

    # Scores are listed in the same order as `metric_names`.
    fold_scores = (
        roc_auc_score(y_test, y_pred_proba),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    )
    for name, score in zip(metric_names, fold_scores):
        metrics[name].append(score)

# Summarise every metric as (mean, standard deviation) over all splits.
results = {}
for metric, scores in metrics.items():
    results[metric] = (np.mean(scores), np.std(scores))

# Report the summary, one metric per line.
print("\nEvaluation Metrics (Mean ± Std):")
for metric in results:
    mean, std = results[metric]
    print(f"{metric.capitalize()}: {mean:.4f} ± {std:.4f}")


Evaluation Metrics (Mean ± Std):
Auc: 0.9987 ± 0.0018
Precision: 0.9905 ± 0.0140
Recall: 0.9826 ± 0.0181
F1: 0.9864 ± 0.0116


## This Code Will *Not* Run Due to Missing Validation Data but Serves as a Workflow Example


In [None]:
# Extract validation data.
#
# The estimators in `models` were fitted on bare arrays (`.values` drops the
# column names), so they identify features purely by column position. The
# validation matrix therefore has to be built from exactly the same columns,
# in exactly the same order, as the training matrix; a different 8-column
# list would be accepted without error and silently scored against the wrong
# features.
final_selected_features_random_forest = [
    'EXOMISER_GENE_PHENO_SCORE',
    'EXOMISER_GENE_VARIANT_SCORE',
    'WEIGHTED_PS1',
    'WEIGHTED_PS2',
    'WEIGHTED_PP5',
    'WEIGHTED_PM5',
    'WEIGHTED_PM4',
    'WEIGHTED_PM2',
]

X_val = ml_data.testing_data.select(final_selected_features_random_forest).to_pandas().values
y_val = ml_data.testing_data.select(["CAUSATIVE_VARIANT_STATUS"]).to_pandas().values.ravel()

val_metrics = {"auc": [], "precision": [], "recall": [], "f1": []}

# Evaluate each model on the validation data.
# NOTE: `models` is the list left behind by whichever cross-validation cell
# was executed last; when the notebook is run top to bottom that is the
# XGBoost cell. Re-run the cell of the model you want to assess before this one.
for model in models:
    y_val_pred_proba = model.predict_proba(X_val)[:, 1]  # second predict_proba column
    y_val_pred = model.predict(X_val)

    # Collect metrics: one entry per fitted model.
    val_metrics["auc"].append(roc_auc_score(y_val, y_val_pred_proba))
    val_metrics["precision"].append(precision_score(y_val, y_val_pred))
    val_metrics["recall"].append(recall_score(y_val, y_val_pred))
    val_metrics["f1"].append(f1_score(y_val, y_val_pred))

# Calculate mean and std for each metric across all fitted models.
val_results = {metric: (np.mean(scores), np.std(scores)) for metric, scores in val_metrics.items()}

# Print results
print("\nValidation Metrics (Mean ± Std):")
for metric, (mean, std) in val_results.items():
    print(f"{metric.capitalize()}: {mean:.4f} ± {std:.4f}")