In [1]:
import nbimporter
from format_data import FormatMLData
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, \
    matthews_corrcoef, cohen_kappa_score, log_loss, roc_auc_score, confusion_matrix, precision_recall_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector

In [2]:
# Load the formatted dataset; all feature columns are read from `ml_data.training_data`.
ml_data = FormatMLData("../data/ml_data_subset.tsv").return_ml_data()


def _columns_starting_with(prefix):
    """Return the training-data column names that begin with `prefix`, in column order."""
    return [name for name in ml_data.training_data.columns if name.startswith(prefix)]


# ACMG-derived feature families, picked out by column-name prefix.
acmg_weighted_features = _columns_starting_with("WEIGHTED")
acmg_tally_features = _columns_starting_with("TALLY_")
acmg_evidence_features = _columns_starting_with("EXOMISER_ACMG_EVIDENCE_")

# Features that are combined with every ACMG feature group.
core_features = ["EXOMISER_GENE_PHENO_SCORE", "EXOMISER_GENE_VARIANT_SCORE"]

# Group name -> ACMG columns evaluated together with `core_features`.
acmg_feature_groups = {
    "ACMG_PPP_MEAN": ["ACMG_PPP_MEAN"],
    "WEIGHTED": acmg_weighted_features,
    "TALLY": acmg_tally_features,
    "ACMG_EVIDENCE": acmg_evidence_features,
}

# Model name -> estimator instance; each is created with a fixed seed of 42.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(seed=42),
}

# (printed label, key in the per-group results dict) for every scalar metric, in print order.
metric_labels = [
    ("Accuracy", "accuracy"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1 Score", "f1_score"),
    ("Balanced Accuracy", "balanced_accuracy"),
    ("Matthews Correlation Coefficient (MCC)", "mcc"),
    ("Cohen’s Kappa", "kappa"),
    ("Log Loss", "logloss"),
    ("ROC AUC", "roc_auc"),
    ("Precision-Recall AUC", "pr_auc"),
    ("Specificity", "specificity"),
]

# The target column is the same for every model and feature group, so extract it once
# instead of re-selecting and re-converting it on every inner iteration.
y = ml_data.training_data.select(["CAUSATIVE_VARIANT_STATUS"])
y_values = y.to_series().to_numpy().ravel()

# For each model: run forward sequential feature selection on every feature group,
# refit the model on the selected columns, and score it on a held-out 20% split.
for model_name, base_model in models.items():
    sfs = SequentialFeatureSelector(
        base_model, n_features_to_select="auto", direction='forward', scoring='roc_auc', cv=10
    )
    performance_results = {}

    for group_name, acmg_features in acmg_feature_groups.items():
        current_features = core_features + acmg_features
        X = ml_data.training_data.select(current_features)
        # Fixed random_state keeps the split identical across models for a given group.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_values, test_size=0.2, random_state=42
        )

        # Fit SFS on the training split only and record which columns it kept.
        sfs.fit(X_train, y_train)
        selected_indices = sfs.get_support()
        selected_feature_names = [
            name for name, keep in zip(current_features, selected_indices) if keep
        ]

        # Subset X_train and X_test to the selected features
        X_train_selected = X_train[:, selected_indices]
        X_test_selected = X_test[:, selected_indices]

        # Train the model on the selected features
        base_model.fit(X_train_selected, y_train)

        # Hard predictions for threshold metrics; column 1 of predict_proba for ranking metrics.
        y_pred = base_model.predict(X_test_selected)
        y_pred_proba = base_model.predict_proba(X_test_selected)[:, 1]

        # Calculate various performance metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
        mcc = matthews_corrcoef(y_test, y_pred)
        kappa = cohen_kappa_score(y_test, y_pred)
        logloss = log_loss(y_test, y_pred_proba)
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        # Specificity = TN / (TN + FP), read from the confusion matrix.
        # NOTE: the 4-way unpacking assumes both classes occur in y_test/y_pred (2x2 matrix).
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        specificity = tn / (tn + fp)

        # Precision-Recall AUC (trapezoidal area under the PR curve)
        precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
        pr_auc = auc(recall_vals, precision_vals)

        # Store performance metrics
        performance_results[group_name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'balanced_accuracy': balanced_accuracy,
            'mcc': mcc,
            'kappa': kappa,
            'logloss': logloss,
            'roc_auc': roc_auc,
            'pr_auc': pr_auc,
            'specificity': specificity,
            'selected_features': selected_feature_names
        }

    # Print out the results for each feature group and model
    print(f"\nResults for {model_name}:")
    for group_name, results in performance_results.items():
        print(f"\nGroup: {group_name}")
        for label, key in metric_labels:
            print(f"{label}: {results[key]:.4f}")
        print(f"Selected Features: {results['selected_features']}")


Results for LogisticRegression:

Group: ACMG_PPP_MEAN
Accuracy: 0.9138
Precision: 0.8750
Recall: 0.9754
F1 Score: 0.9225
Balanced Accuracy: 0.9104
Matthews Correlation Coefficient (MCC): 0.8322
Cohen’s Kappa: 0.8260
Log Loss: 0.2254
ROC AUC: 0.9861
Precision-Recall AUC: 0.9876
Specificity: 0.8455
Selected Features: ['EXOMISER_GENE_VARIANT_SCORE']

Group: WEIGHTED
Accuracy: 0.9957
Precision: 0.9919
Recall: 1.0000
F1 Score: 0.9959
Balanced Accuracy: 0.9955
Matthews Correlation Coefficient (MCC): 0.9914
Cohen’s Kappa: 0.9914
Log Loss: 0.0640
ROC AUC: 1.0000
Precision-Recall AUC: 1.0000
Specificity: 0.9909
Selected Features: ['EXOMISER_GENE_PHENO_SCORE', 'EXOMISER_GENE_VARIANT_SCORE', 'WEIGHTED_PS1', 'WEIGHTED_PS2', 'WEIGHTED_PP5', 'WEIGHTED_PM5', 'WEIGHTED_PM4', 'WEIGHTED_PM2']

Group: TALLY
Accuracy: 0.9828
Precision: 0.9683
Recall: 1.0000
F1 Score: 0.9839
Balanced Accuracy: 0.9818
Matthews Correlation Coefficient (MCC): 0.9659
Cohen’s Kappa: 0.9654
Log Loss: 0.0525
ROC AUC: 0.9983
Prec