# **ClinVar SNV ROC & AUC Generator**
#### Author: Daniel Brock
#### Date: 2023-07-06
#### Purpose: to re-add the "pathogenicity rankings (PLP, BLB)" and other columns provided in ClinVar's new variant annotated output from annovar and calculate ROC and AUC

## **Importing Required Packages**

In [1]:
# Importing required packages
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Working directory
cwd = os.getcwd()
print(cwd)

c:\Users\TooFastDan\OneDrive - Baylor College of Medicine\BCM\Projects\Autosomal Dominant Predictor of IRDs\manuscript\GitHub


## **Defining Important Functions for ROC-AUC Analysis**

In [2]:
def roc_auc(df, model, truth, disp_results):
    """
    Function to:
    - filter NaN values and calculate percent retained
    - calculate ROC parametrics: false positive rate (fpr), true positive rate (tpr), thresholds
    - Area Under the Curve (auc_score)
    
    Inputs:
    df: autosomal recessive or dominant dataframes
    model: column name of the predictor model in the df
    truth: column name for the ground truth (PLP vs BLB)
    disp_results = True/False on whether to show results of model or not
    
    Outputs:
    fpr: false positive rate
    tpr: true positive rate
    thresholds
    auc_score
    percent of SNVs retained after dropping NaN values
    """
    # Filtering out NaNs and rows with characters in a specified column (model) and calculating percent retained
    df_filt = df[~df[model].isna()]
    percent_retained = (df_filt.shape[0] / df.shape[0]) * 100
    
    try:
        # Calulating false positive rate (fpr), true positive rate (tpr), thresholds, and AUC scores from fpr + tpr
        model_scores = df_filt[model].astype(float)
        fpr, tpr, threshold = roc_curve(y_true=df_filt[truth], y_score=model_scores)
        
        # Calulating AUC scores
        auc_score = auc(fpr, tpr)
        
        # Print and returning results
        if disp_results is True:
            print("{0} AUC = {1} while using {2}% of the values.".format(model, np.round(auc_score, 5), np.round(percent_retained, 3)))
        if disp_results is False:
            pass
        return fpr, tpr, threshold, auc_score, percent_retained
    except:
        print("{} failed due to mixed datatypes".format(model))

In [3]:
def roc_auc_looper(df_input, disp_results):
    """
    Function to loop over 39 predictive models and return ROC and AUC dataframes consisting of fpr, tpr, thresholds, and AUCs.
    Dependent on the "roc_auc" function.
    
    inputs:
    - df_input = dataframe from annovar output with filtered genes of interest, model results, and a "y_test" truth column
    - disp_results = True/False on whether to show results of model or not
    
    outputs: 
    - auc_df
    - roc_df
    """
    # List of models to calculate ROC & AUC values
    models = ["MutScore", "VEST4_score", "REVEL_score", "PROVEAN_converted_rankscore", "MutPred_score", "MVP_score", "CADD_raw", 
              "MetaSVM_score", "MetaRNN_score", "MetaLR_score", "M-CAP_score", "DEOGEN2_score", "FATHMM_converted_rankscore", 
              "fathmm-MKL_coding_score", "fathmm-XF_coding_score", "SIFT_converted_rankscore", "SIFT4G_converted_rankscore", 
              "Polyphen2_HDIV_score", "Polyphen2_HVAR_score", "LRT_converted_rankscore", "MutationTaster_converted_rankscore", 
              "MutationAssessor_score", "MPC_score", "PrimateAI_score", "BayesDel_addAF_score", "BayesDel_noAF_score", 
              "ClinPred_score", "LIST-S2_score", "DANN_score", "Eigen-raw_coding", "Eigen-PC-raw_coding", "GenoCanyon_rankscore", 
              "integrated_fitCons_score", "GERP++_RS", "phyloP100way_vertebrate", "phyloP30way_mammalian", "phastCons100way_vertebrate", 
              "phastCons30way_mammalian", "SiPhy_29way_logOdds"]
    
    try:
        # Looping and creating lists/dfs to append results to
        auc_data = []
        roc_df = pd.DataFrame(columns=["fpr", "tpr", "threshold", "model"])
        for m in models:
            # Calulating parametrics for each prediction model
            fpr, tpr, threshold, auc_score, percent_retained = roc_auc(df=df_input, model=m, truth="y_test", disp_results=disp_results)

            # Appending a list with a tuple containing model name, its AUC score, and its percent SNV retained
            auc_data.append((m, auc_score, percent_retained))

            # Making a dataframe with fpr, tpr, thresholds, and model ID for drawing ROC curves
            df_temp = pd.DataFrame({"fpr": fpr, "tpr": tpr, "threshold": threshold, "model": np.full(len(fpr), m)})
            roc_df = pd.concat([roc_df, df_temp], axis=0)

        # Making the AUC dataframe from list of tuples
        auc_df = pd.DataFrame(auc_data, columns=["Model", "AUC", "Percent_Retained"])
        auc_df = auc_df.sort_values("AUC", ascending=False)

        # Displaying and returning results results
        print("\nAUC df shape: {}".format(auc_df.shape))
        display(auc_df.head())
        print("\nROC df shape: {}".format(roc_df.shape))
        display(roc_df.head())
        return auc_df, roc_df
    except:
        print("Error: Bad input")

## **Importing the filtered clinvar dataset (data after 11/21/2020)**
### (This was the input to ANNOVAR)

In [4]:
# Importing the new testing dataset with pathogenicity ranks and dates filtered after MutScore paper (11-21-2020 - from ClinVar_VCF_parser)
clinvar_avi = pd.read_table(cwd+"/annovar_files/clinvar_filt_retnet.avinput", header=None,
                           names=['Chr', 'Start', 'End', 'Ref', 'Alt', 'QUAL', 'FILTER', 'INFO', 'ID', 'ALLELEID', 'CLNDISDB', 'CLNDN', 'CLNHGVS', 
                                  'CLNREVSTAT', 'CLNVC', 'CLNVCSO', 'GENEINFO', 'Gene', 'Gene_ID', 'MC', 'ORIGIN', 'CLNSIG', 'PATHOGENICITY', 'y_test'])

# Displaying info
display(clinvar_avi.shape)
display(clinvar_avi.head())

(62696, 24)

Unnamed: 0,Chr,Start,End,Ref,Alt,QUAL,FILTER,INFO,ID,ALLELEID,...,CLNVC,CLNVCSO,GENEINFO,Gene,Gene_ID,MC,ORIGIN,CLNSIG,PATHOGENICITY,y_test
0,chr10,71142347,71142347,C,A,.,.,"ALLELEID=2671427;CLNDISDB=MONDO:MONDO:0032807,...",2506469,2671427,...,single_nucleotide_variant,SO:0001483,HK1:3098,HK1,3098,SO:0001583|missense_variant,,Likely_pathogenic,PLP,1
1,chr20,10654126,10654126,A,C,.,.,"ALLELEID=2671423;CLNDISDB=MONDO:MONDO:0016862,...",2506465,2671423,...,single_nucleotide_variant,SO:0001483,JAG1:182,JAG1,182,SO:0001583|missense_variant,,Likely_pathogenic,PLP,1
2,chr6,65300662,65300662,T,A,.,.,ALLELEID=2671350;CLNDISDB=Human_Phenotype_Onto...,2506396,2671350,...,single_nucleotide_variant,SO:0001483,EYS:346007,EYS,346007,SO:0001587|nonsense,,Pathogenic,PLP,1
3,chr4,39255536,39255536,C,T,.,.,"ALLELEID=2671345;CLNDISDB=MONDO:MONDO:0013719,...",2506391,2671345,...,single_nucleotide_variant,SO:0001483,WDR19:57728,WDR19,57728,SO:0001587|nonsense,,Likely_pathogenic,PLP,1
4,chr12,48380959,48380959,C,T,.,.,"ALLELEID=2671226;CLNDISDB=MONDO:MONDO:0008702,...",2506270,2671226,...,single_nucleotide_variant,SO:0001483,COL2A1:1280,COL2A1,1280,SO:0001583|missense_variant,,Likely_pathogenic,PLP,1


## **Importing annotated clinvar retnet file after being run in ANNOVAR**

In [5]:
# Importing the annotated csv file from annovar (run in linux)
anno = pd.read_csv(cwd+"/annovar_files/clinvar_filt_retnet_annotated.hg19_multianno.csv", na_values=".")
display(anno.shape)
display(anno.head())

  anno = pd.read_csv(cwd+"/annovar_files/clinvar_filt_retnet_annotated.hg19_multianno.csv", na_values=".")


(62696, 121)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons30way_mammalian,phastCons30way_mammalian_rankscore,SiPhy_29way_logOdds,SiPhy_29way_logOdds_rankscore,Interpro_domain,GTEx_V8_gene,GTEx_V8_tissue,MutScore
0,chr10,71142347,71142347,C,A,exonic,HK1,,nonsynonymous SNV,"HK1:NM_001322367:exon9:c.C1274A:p.T425K,HK1:NM...",...,1.0,0.716,0.826,0.344,20.152,0.981,Hexokinase\x2c C-terminal;Hexokinase\x2c C-ter...,,,0.734
1,chr20,10654126,10654126,A,C,exonic,JAG1,,nonsynonymous SNV,JAG1:NM_000214:exon1:c.T53G:p.L18R,...,0.998,0.413,1.0,0.863,12.448,0.55,,,,0.7
2,chr6,65300662,65300662,T,A,exonic,EYS,,stopgain,"EYS:NM_001142800:exon26:c.A5098T:p.K1700X,EYS:...",...,0.014,0.19,0.998,0.659,5.923,0.183,.;.,,,
3,chr4,39255536,39255536,C,T,exonic,WDR19,,stopgain,"WDR19:NM_001317924:exon25:c.C2407T:p.Q803X,WDR...",...,0.997,0.402,0.986,0.501,14.372,0.664,,,,
4,chr12,48380959,48380959,C,T,exonic,COL2A1,,nonsynonymous SNV,"COL2A1:NM_033150:exon20:c.G1060A:p.G354S,COL2A...",...,1.0,0.716,0.967,0.444,16.928,0.861,.;.,,,0.993


## **Merging the two datasets to include pathogenicity scoring**

In [6]:
anno_clinvar = pd.merge(clinvar_avi, anno, on=["Chr", "Start", "End", "Ref", "Alt"]) #merging
anno_clinvar = anno_clinvar.drop_duplicates() #dropping a small number of duplicates
display(anno_clinvar.shape)
display(anno_clinvar.head())

(62693, 140)

Unnamed: 0,Chr,Start,End,Ref,Alt,QUAL,FILTER,INFO,ID,ALLELEID,...,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons30way_mammalian,phastCons30way_mammalian_rankscore,SiPhy_29way_logOdds,SiPhy_29way_logOdds_rankscore,Interpro_domain,GTEx_V8_gene,GTEx_V8_tissue,MutScore
0,chr10,71142347,71142347,C,A,.,.,"ALLELEID=2671427;CLNDISDB=MONDO:MONDO:0032807,...",2506469,2671427,...,1.0,0.716,0.826,0.344,20.152,0.981,Hexokinase\x2c C-terminal;Hexokinase\x2c C-ter...,,,0.734
1,chr20,10654126,10654126,A,C,.,.,"ALLELEID=2671423;CLNDISDB=MONDO:MONDO:0016862,...",2506465,2671423,...,0.998,0.413,1.0,0.863,12.448,0.55,,,,0.7
2,chr6,65300662,65300662,T,A,.,.,ALLELEID=2671350;CLNDISDB=Human_Phenotype_Onto...,2506396,2671350,...,0.014,0.19,0.998,0.659,5.923,0.183,.;.,,,
3,chr4,39255536,39255536,C,T,.,.,"ALLELEID=2671345;CLNDISDB=MONDO:MONDO:0013719,...",2506391,2671345,...,0.997,0.402,0.986,0.501,14.372,0.664,,,,
4,chr12,48380959,48380959,C,T,.,.,"ALLELEID=2671226;CLNDISDB=MONDO:MONDO:0008702,...",2506270,2671226,...,1.0,0.716,0.967,0.444,16.928,0.861,.;.,,,0.993


In [12]:
# Optional export of the merged df
#anno_clinvar.to_csv(cwd+"/annovar_files/clinvar_filt_retnet_annotated_PLP-BLB.csv", index=False)

## **Filtering for only nonsynonomous SNVs**

In [7]:
anno_clinvar_filt = anno_clinvar[anno_clinvar["ExonicFunc.refGene"]=="nonsynonymous SNV"]
ird_total_variants = anno_clinvar.shape[0]
ird_nonsyn_variants = anno_clinvar_filt.shape[0]
percent_nonsyn_IRD = np.round((ird_nonsyn_variants / ird_total_variants) * 100, decimals=2)
print("Shape of ClinVar variants filtered for nonsynonomous variants: {0}\nShape of total ClinVar IRD variants: {1}\n{2}% nonsynonomous SNV representation".format(
    str(anno_clinvar_filt.shape), str(anno_clinvar.shape), str(percent_nonsyn_IRD)))

Shape of ClinVar variants filtered for nonsynonomous variants: (3322, 140)
Shape of total ClinVar IRD variants: (62693, 140)
5.3% nonsynonomous SNV representation


## **Separating filtered, nonsynonomous IRD genes based on inheritance pattern defined in RetNet**

In [8]:
# Importing RetNet Genes
retnet = pd.read_excel(cwd+"/RetNet/RetNet_AD-AR_FINALIZED.xlsx", sheet_name="gene_info")
retnet.rename(columns={"gene": "Gene.refGene"}, inplace=True)
retnet_genes = pd.Series(retnet["Gene.refGene"])
display(retnet.head())

# Merging with filtered annotated clinvar data
anno_clinvar_filt = pd.merge(anno_clinvar_filt, retnet, on="Gene.refGene", how="left")
display(anno_clinvar_filt)

Unnamed: 0,Symbols;\nOMIM Numbers,Gene.refGene,gene_id,Location,Diseases;_x000D_\nProtein,OMIM Phenotypes,How Identified;_x000D_\nComments,Chromosome,pLI,exac_pLI,AR,AD,AD GOF,AD Haploinsuffiency,Complex
0,"COL11A1, STL2;_x000D_\n 120280, 154780, 604841",COL11A1,ENSG00000060718,1p21.1,"dominant Stickler syndrome, type II; dominant ...","Fibrochondrogenesis 1, 228520 (3), Autosomal r...","linkage mapping, candidate gene; Stickler synd...",1,1.0,0.99998,0,1,0,1,
1,"PRPF3, HPRP3, PRP3, RP18;_x000D_\n 268000, 601...",PRPF3,ENSG00000117360,1q21.2,dominant retinitis pigmentosa; protein: pre-mR...,"Retinitis pigmentosa 18, 601414 (3), Autosomal...","linkage mapping, candidate gene; English and D...",1,1.0,1.0,0,1,0,1,
2,"MFN2, CMT6, CMT2A2, MARF;\n 608507, 609260, 60...",MFN2,ENSG00000116688,1p36.22,dominant optic atrophy with neuropathy and myo...,"Lipomatosis, multiple symmetric, with or witho...",candidate gene; dominant mutation in a large T...,1,0.99415,0.99982,0,1,0,1,
3,ELOVL1;_x000D_\n 611813,ELOVL1,ENSG00000066322,1p34.2,"dominant optic atrophy, deafness, ichthyosis a...","Ichthyotic keratoderma, spasticity, hypomyelin...","whole-exome sequencing; identical, de novo, do...",1,0.67897,0.22341,0,1,1,0,
4,"ADIPOR1, PAQR1;_x000D_\n 607945",ADIPOR1,ENSG00000159346,1q32.1,"recessive retinitis pigmentosa, syndromic, Bar...",,whole-exome sequencing; a homozygous frameshif...,1,0.6584,0.59275,1,1,1,0,1.0


Unnamed: 0,Chr,Start,End,Ref,Alt,QUAL,FILTER,INFO,ID,ALLELEID,...,OMIM Phenotypes,How Identified;_x000D_\nComments,Chromosome,pLI,exac_pLI,AR,AD,AD GOF,AD Haploinsuffiency,Complex
0,chr10,71142347,71142347,C,A,.,.,"ALLELEID=2671427;CLNDISDB=MONDO:MONDO:0032807,...",2506469,2671427,...,"Retinitis pigmentosa 79, 617460 (3), Autosomal...","linkage mapping, whole-exome sequencing; a mis...",10,9.145900e-01,5.557300e-01,1.0,1.0,1.0,0.0,1.0
1,chr20,10654126,10654126,A,C,.,.,"ALLELEID=2671423;CLNDISDB=MONDO:MONDO:0016862,...",2506465,2671423,...,"?Deafness, congenital heart defects, and poste...","deletion mapping, candidate gene; multiple aff...",20,1.000000e+00,1.000000e+00,0.0,1.0,0.0,1.0,
2,chr12,48380959,48380959,C,T,.,.,"ALLELEID=2671226;CLNDISDB=MONDO:MONDO:0008702,...",2506270,2671226,...,?Vitreoretinopathy with phalangeal epiphyseal ...,"linkage mapping, candidate gene; mutations in ...",12,1.000000e+00,1.000000e+00,0.0,1.0,0.0,1.0,
3,chr20,3897663,3897663,T,A,.,.,"ALLELEID=2671068;CLNDISDB=MONDO:MONDO:0009319,...",2506108,2671068,...,"HARP syndrome, 607236 (3), Autosomal recessive...","homozygosity mapping, candidate gene; symptoms...",20,7.403400e-07,1.079200e-02,1.0,0.0,0.0,0.0,
4,chr12,48378867,48378867,C,T,.,.,ALLELEID=2670853;CLNDISDB=MedGen:CN517202;CLND...,2505889,2670853,...,?Vitreoretinopathy with phalangeal epiphyseal ...,"linkage mapping, candidate gene; mutations in ...",12,1.000000e+00,1.000000e+00,0.0,1.0,0.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3325,chr11,86662324,86662324,C,G,.,.,ALLELEID=975335;CLNDISDB=MedGen:CN517202;CLNDN...,986930,975335,...,"Retinopathy of prematurity, 133780 (3), Autoso...","linkage mapping, candidate gene; Criswick-Sche...",11,9.696200e-01,9.207900e-01,0.0,1.0,0.0,1.0,
3326,chr1,94506874,94506874,A,T,.,.,ALLELEID=974993;CLNDISDB=MedGen:CN517202;CLNDN...,986929,974993,...,"Retinal dystrophy, early-onset severe, 248200 ...","linkage mapping, candidate gene; may be involv...",1,5.349900e-48,1.267600e-28,1.0,0.0,0.0,0.0,
3327,chr1,103354176,103354176,C,A,.,.,ALLELEID=974943;CLNDISDB=MedGen:CN517202;CLNDN...,986890,974943,...,"Fibrochondrogenesis 1, 228520 (3), Autosomal r...","linkage mapping, candidate gene; Stickler synd...",1,1.000000e+00,9.999800e-01,0.0,1.0,0.0,1.0,
3328,chr6,42146081,42146081,G,A,.,.,ALLELEID=975196;CLNDISDB=MedGen:CN517202;CLNDN...,986883,975196,...,"Cone-rod dystrophy 14, 602093 (3), Autosomal d...","linkage mapping, candidate gene; British famil...",6,4.142900e-04,9.918300e-03,0.0,1.0,1.0,0.0,


In [9]:
anno_clinvar_filt["PATHOGENICITY"].value_counts()

BLB    1948
PLP    1382
Name: PATHOGENICITY, dtype: int64

In [62]:
anno_clinvar_filt.columns

Index(['Chr', 'Start', 'End', 'Ref', 'Alt', 'QUAL', 'FILTER', 'INFO', 'ID',
       'ALLELEID',
       ...
       'OMIM Phenotypes', 'How Identified;_x000D_\nComments', 'Chromosome',
       'pLI', 'exac_pLI', 'AR', 'AD', 'AD GOF', 'AD Haploinsuffiency',
       'Complex'],
      dtype='object', length=154)

In [64]:
# Optional export to csv and excel
#anno_clinvar_filt.to_csv(cwd+"/annovar_files/clinvar_filt_retnet_annotated_PLP-BLB_Nonsynonomous_Variants.csv", index=False)
#anno_clinvar_filt.to_excel(cwd+"/annovar_files/clinvar_filt_retnet_annotated_PLP-BLB_Nonsynonomous_Variants.xlsx", index=False) #slow

In [10]:
# Getting lists of genes with various inheritance patterns
ar_all_genes = retnet[retnet["AR"]==1]["Gene.refGene"]
ar_only_genes = retnet[(retnet["AR"]==1) & (retnet["AD"]==0)]["Gene.refGene"]
ad_all_genes = retnet[retnet["AD"]==1]["Gene.refGene"]
ad_only_genes = retnet[(retnet["AD"]==1) & (retnet["AR"]==0)]["Gene.refGene"]
ad_gof_all_genes = retnet[retnet["AD GOF"]==1]["Gene.refGene"]
ad_gof_only_genes = retnet[(retnet["AD GOF"]==1) & (retnet["AR"]==0)]["Gene.refGene"]
ad_haplo_all_genes = retnet[retnet["AD Haploinsuffiency"]==1]["Gene.refGene"]
ad_haplo_only_genes = retnet[(retnet["AD Haploinsuffiency"]==1) & (retnet["AR"]==0)]["Gene.refGene"]

# Filting IRD ClinVar variants into different dfs for subsequent ROC + AUC analysis
ar_all = anno_clinvar_filt[anno_clinvar_filt["Gene.refGene"].isin(ar_all_genes)]
print("ALL Autosomal Recessive variants = {}".format(ar_all.shape))

ar_only = anno_clinvar_filt[anno_clinvar_filt["Gene.refGene"].isin(ar_only_genes)]
print("ONLY Autosomal Recessive variants = {}".format(ar_only.shape))

ad_all = anno_clinvar_filt[anno_clinvar_filt["Gene.refGene"].isin(ad_all_genes)]
print("\nALL Autosomal Dominant variants = {}".format(ad_all.shape))

ad_only = anno_clinvar_filt[anno_clinvar_filt["Gene.refGene"].isin(ad_only_genes)]
print("ONLY Autosomal Dominant variants = {}".format(ad_only.shape))

ad_gof_all = anno_clinvar_filt[anno_clinvar_filt["Gene.refGene"].isin(ad_gof_all_genes)]
print("\nALL Gain-of-Function Autosomal Dominant variants = {}".format(ad_gof_all.shape))

ad_gof_only = anno_clinvar_filt[anno_clinvar_filt["Gene.refGene"].isin(ad_gof_only_genes)]
print("ONLY Gain-of-Function Autosomal Dominant variants = {}".format(ad_gof_only.shape))

ad_haplo_all = anno_clinvar_filt[anno_clinvar_filt["Gene.refGene"].isin(ad_haplo_all_genes)]
print("\nALL Haploinsuffient Autosomal Dominant variants = {}".format(ad_haplo_all.shape))

ad_haplo_only = anno_clinvar_filt[anno_clinvar_filt["Gene.refGene"].isin(ad_haplo_only_genes)]
print("ONLY Haploinsuffient Autosomal Dominant variants = {}".format(ad_haplo_only.shape))

ALL Autosomal Recessive variants = (2392, 154)
ONLY Autosomal Recessive variants = (1752, 154)

ALL Autosomal Dominant variants = (1524, 154)
ONLY Autosomal Dominant variants = (884, 154)

ALL Gain-of-Function Autosomal Dominant variants = (761, 154)
ONLY Gain-of-Function Autosomal Dominant variants = (156, 154)

ALL Haploinsuffient Autosomal Dominant variants = (763, 154)
ONLY Haploinsuffient Autosomal Dominant variants = (728, 154)


In [11]:
# Number of variants per gene
gene_variant_freq = pd.DataFrame(anno_clinvar_filt["Gene.refGene"].value_counts()).reset_index()
display(gene_variant_freq.head())
#gene_variant_freq.to_excel(cwd+"/results/gene_variant_frequency.xlsx", index=False)

Unnamed: 0,index,Gene.refGene
0,COL2A1,298
1,ABCA4,192
2,COL11A1,135
3,ADGRV1,130
4,NBAS,118


## **Generating ROC and AUC values**

### ALL Autosomal Recessive IRD SNVs Performance

In [12]:
# List of models to calculate ROC & AUC values
models = ["MutScore", "VEST4_score", "REVEL_score", "PROVEAN_converted_rankscore", "MutPred_score", "MVP_score", "CADD_raw", 
          "MetaSVM_score", "MetaRNN_score", "MetaLR_score", "M-CAP_score", "DEOGEN2_score", "FATHMM_converted_rankscore", 
          "fathmm-MKL_coding_score", "fathmm-XF_coding_score", "SIFT_converted_rankscore", "SIFT4G_converted_rankscore", 
          "Polyphen2_HDIV_score", "Polyphen2_HVAR_score", "LRT_converted_rankscore", "MutationTaster_converted_rankscore", 
          "MutationAssessor_score", "MPC_score", "PrimateAI_score", "BayesDel_addAF_score", "BayesDel_noAF_score", 
          "ClinPred_score", "LIST-S2_score", "DANN_score", "Eigen-raw_coding", "Eigen-PC-raw_coding", "GenoCanyon_rankscore", 
          "integrated_fitCons_score", "GERP++_RS", "phyloP100way_vertebrate", "phyloP30way_mammalian", "phastCons100way_vertebrate", 
          "phastCons30way_mammalian", "SiPhy_29way_logOdds"]

# Looping and creating lists/dfs to append results to
auc_ar_all_data = []
roc_ar_all_df = pd.DataFrame(columns=["fpr", "tpr", "threshold", "model"])
for m in models:
    # Calulating parametrics for each prediction model
    fpr, tpr, threshold, auc_score, percent_retained = roc_auc(df=ar_all, model=m, truth="y_test", disp_results=True)
    
    # Appending a list with a tuple containing model name, its AUC score, and its percent SNV retained
    auc_ar_all_data.append((m, auc_score, percent_retained))
    
    # Making a dataframe with fpr, tpr, thresholds, and model ID for drawing ROC curves
    model_df = pd.DataFrame({"fpr": fpr, "tpr": tpr, "threshold": threshold, "model": np.full(len(fpr), m)})
    roc_ar_all_df = pd.concat([roc_ar_all_df, model_df], axis=0)

# Making the AUC dataframe from list of tuples
auc_ar_all_df = pd.DataFrame(auc_ar_all_data, columns=["Model", "AUC", "Percent_Retained"]).sort_values("AUC", ascending=False)

# Displaying results
print("\nAUC AR ALL DF shape: {}".format(auc_ar_all_df.shape))
display(auc_ar_all_df.head())
print("\nROC AR ALL DF shape: {}".format(roc_ar_all_df.shape))
display(roc_ar_all_df.head())

MutScore AUC = 0.96242 while using 96.53% of the values.
VEST4_score AUC = 0.93324 while using 96.739% of the values.
REVEL_score AUC = 0.95337 while using 96.53% of the values.
PROVEAN_converted_rankscore AUC = 0.89119 while using 93.144% of the values.
MutPred_score AUC = 0.89709 while using 63.294% of the values.
MVP_score AUC = 0.91972 while using 92.349% of the values.
CADD_raw AUC = 0.87433 while using 97.199% of the values.
MetaSVM_score AUC = 0.91626 while using 96.53% of the values.
MetaRNN_score AUC = 0.98291 while using 97.199% of the values.
MetaLR_score AUC = 0.90568 while using 96.53% of the values.
M-CAP_score AUC = 0.8994 while using 88.253% of the values.
DEOGEN2_score AUC = 0.91722 while using 90.385% of the values.
FATHMM_converted_rankscore AUC = 0.81636 while using 92.266% of the values.
fathmm-MKL_coding_score AUC = 0.84152 while using 97.199% of the values.
fathmm-XF_coding_score AUC = 0.8931 while using 84.992% of the values.
SIFT_converted_rankscore AUC = 0.862

Unnamed: 0,Model,AUC,Percent_Retained
26,ClinPred_score,0.984252,96.613712
8,MetaRNN_score,0.982913,97.198997
24,BayesDel_addAF_score,0.975842,96.822742
0,MutScore,0.962417,96.5301
2,REVEL_score,0.953366,96.5301



ROC AR ALL DF shape: (21362, 4)


Unnamed: 0,fpr,tpr,threshold,model
0,0.0,0.0,2.0,MutScore
1,0.0,0.036417,1.0,MutScore
2,0.0,0.063976,0.999,MutScore
3,0.0,0.094488,0.998,MutScore
4,0.0,0.122047,0.997,MutScore


In [29]:
# Exporting to graph in R
#auc_ar_all_df.to_excel(cwd+"/AUC_ROC_results/auc/AUC_AR_ALL_performance.xlsx", index=False)
#roc_ar_all_df.to_excel(cwd+"/AUC_ROC_results/roc/ROC_AR_ALL_performance.xlsx", index=False)

### ONLY Autosomal Recessive IRD SNVs Performance

In [30]:
# List of models to calculate ROC & AUC values
models = ["MutScore", "VEST4_score", "REVEL_score", "PROVEAN_converted_rankscore", "MutPred_score", "MVP_score", "CADD_raw", 
          "MetaSVM_score", "MetaRNN_score", "MetaLR_score", "M-CAP_score", "DEOGEN2_score", "FATHMM_converted_rankscore", 
          "fathmm-MKL_coding_score", "fathmm-XF_coding_score", "SIFT_converted_rankscore", "SIFT4G_converted_rankscore", 
          "Polyphen2_HDIV_score", "Polyphen2_HVAR_score", "LRT_converted_rankscore", "MutationTaster_converted_rankscore", 
          "MutationAssessor_score", "MPC_score", "PrimateAI_score", "BayesDel_addAF_score", "BayesDel_noAF_score", 
          "ClinPred_score", "LIST-S2_score", "DANN_score", "Eigen-raw_coding", "Eigen-PC-raw_coding", "GenoCanyon_rankscore", 
          "integrated_fitCons_score", "GERP++_RS", "phyloP100way_vertebrate", "phyloP30way_mammalian", "phastCons100way_vertebrate", 
          "phastCons30way_mammalian", "SiPhy_29way_logOdds"]

# Looping and creating lists/dfs to append results to
auc_ar_only_data = []
roc_ar_only_df = pd.DataFrame(columns=["fpr", "tpr", "threshold", "model"])
for m in models:
    # Calulating parametrics for each prediction model
    fpr, tpr, threshold, auc_score, percent_retained = roc_auc(df=ar_only, model=m, truth="y_test", disp_results=True)
    
    # Appending a list with a tuple containing model name, its AUC score, and its percent SNV retained
    auc_ar_only_data.append((m, auc_score, percent_retained))
    
    # Making a dataframe with fpr, tpr, thresholds, and model ID for drawing ROC curves
    model_df = pd.DataFrame({"fpr": fpr, "tpr": tpr, "threshold": threshold, "model": np.full(len(fpr), m)})
    roc_ar_only_df = pd.concat([roc_ar_only_df, model_df], axis=0)

# Making the AUC dataframe from list of tuples
auc_ar_only_df = pd.DataFrame(auc_ar_only_data, columns=["Model", "AUC", "Percent_Retained"]).sort_values("AUC", ascending=False)

# Displaying results
print("\nAUC AR ONLY DF shape: {}".format(auc_ar_only_df.shape))
display(auc_ar_only_df.head())
print("\nROC AR ONLY DF shape: {}".format(roc_ar_only_df.shape))
display(roc_ar_only_df.head())

MutScore AUC = 0.95801 while using 95.585% of the values.
VEST4_score AUC = 0.93679 while using 95.929% of the values.
REVEL_score AUC = 0.95324 while using 95.585% of the values.
PROVEAN_converted_rankscore AUC = 0.88772 while using 91.628% of the values.
MutPred_score AUC = 0.8955 while using 58.83% of the values.
MVP_score AUC = 0.91828 while using 90.539% of the values.
CADD_raw AUC = 0.88489 while using 96.502% of the values.
MetaSVM_score AUC = 0.9184 while using 95.585% of the values.
MetaRNN_score AUC = 0.98299 while using 96.502% of the values.
MetaLR_score AUC = 0.90693 while using 95.585% of the values.
M-CAP_score AUC = 0.90041 while using 86.353% of the values.
DEOGEN2_score AUC = 0.91381 while using 90.883% of the values.
FATHMM_converted_rankscore AUC = 0.81743 while using 90.31% of the values.
fathmm-MKL_coding_score AUC = 0.85267 while using 96.502% of the values.
fathmm-XF_coding_score AUC = 0.89918 while using 82.511% of the values.
SIFT_converted_rankscore AUC = 0.8

Unnamed: 0,Model,AUC,Percent_Retained
26,ClinPred_score,0.98459,95.756881
8,MetaRNN_score,0.982994,96.502294
24,BayesDel_addAF_score,0.97465,96.043578
0,MutScore,0.958007,95.584862
2,REVEL_score,0.953237,95.584862



ROC AR ONLY DF shape: (17046, 4)


Unnamed: 0,fpr,tpr,threshold,model
0,0.0,0.0,2.0,MutScore
1,0.0,0.034175,1.0,MutScore
2,0.0,0.057949,0.999,MutScore
3,0.0,0.10847,0.997,MutScore
4,0.0,0.132244,0.996,MutScore


In [32]:
# Exporting to graph in R
#auc_ar_only_df.to_excel(cwd+"/AUC_ROC_results/auc/AUC_AR_ONLY_performance.xlsx", index=False)
#roc_ar_only_df.to_excel(cwd+"/AUC_ROC_results/roc/ROC_AR_ONLY_performance.xlsx", index=False)

### ALL Autosomal Dominant (GOF + Haploinsuffiency + AR Overlap) IRD SNVs Performance

In [33]:
# List of models to calculate ROC & AUC values
models = ["MutScore", "VEST4_score", "REVEL_score", "PROVEAN_converted_rankscore", "MutPred_score", "MVP_score", "CADD_raw", 
          "MetaSVM_score", "MetaRNN_score", "MetaLR_score", "M-CAP_score", "DEOGEN2_score", "FATHMM_converted_rankscore", 
          "fathmm-MKL_coding_score", "fathmm-XF_coding_score", "SIFT_converted_rankscore", "SIFT4G_converted_rankscore", 
          "Polyphen2_HDIV_score", "Polyphen2_HVAR_score", "LRT_converted_rankscore", "MutationTaster_converted_rankscore", 
          "MutationAssessor_score", "MPC_score", "PrimateAI_score", "BayesDel_addAF_score", "BayesDel_noAF_score", 
          "ClinPred_score", "LIST-S2_score", "DANN_score", "Eigen-raw_coding", "Eigen-PC-raw_coding", "GenoCanyon_rankscore", 
          "integrated_fitCons_score", "GERP++_RS", "phyloP100way_vertebrate", "phyloP30way_mammalian", "phastCons100way_vertebrate", 
          "phastCons30way_mammalian", "SiPhy_29way_logOdds"]

# Looping and creating lists/dfs to append results to
auc_ad_all_data = []
roc_ad_all_df = pd.DataFrame(columns=["fpr", "tpr", "threshold", "model"])
for m in models:
    # Calulating parametrics for each prediction model
    fpr, tpr, threshold, auc_score, percent_retained = roc_auc(df=ad_all, model=m, truth="y_test", disp_results=True)
    
    # Appending a list with a tuple containing model name, its AUC score, and its percent SNV retained
    auc_ad_all_data.append((m, auc_score, percent_retained))
    
    # Making a dataframe with fpr, tpr, thresholds, and model ID for drawing ROC curves
    model_df = pd.DataFrame({"fpr": fpr, "tpr": tpr, "threshold": threshold, "model": np.full(len(fpr), m)})
    roc_ad_all_df = pd.concat([roc_ad_all_df, model_df], axis=0)

# Making the AUC dataframe from list of tuples
auc_ad_all_df = pd.DataFrame(auc_ad_all_data, columns=["Model", "AUC", "Percent_Retained"]).sort_values("AUC", ascending=False)

# Displaying results
print("\nAUC AD ALL DF shape: {}".format(auc_ad_all_df.shape))
display(auc_ad_all_df.head())
print("\nROC AD ALL DF shape: {}".format(roc_ad_all_df.shape))
display(roc_ad_all_df.head())

MutScore AUC = 0.96883 while using 99.147% of the values.
VEST4_score AUC = 0.92845 while using 99.081% of the values.
REVEL_score AUC = 0.94261 while using 99.147% of the values.
PROVEAN_converted_rankscore AUC = 0.88802 while using 98.36% of the values.
MutPred_score AUC = 0.92564 while using 75.787% of the values.
MVP_score AUC = 0.93086 while using 97.835% of the values.
CADD_raw AUC = 0.84611 while using 99.147% of the values.
MetaSVM_score AUC = 0.89983 while using 99.147% of the values.
MetaRNN_score AUC = 0.96832 while using 99.147% of the values.
MetaLR_score AUC = 0.89867 while using 99.147% of the values.
M-CAP_score AUC = 0.92198 while using 95.079% of the values.
DEOGEN2_score AUC = 0.9074 while using 92.126% of the values.
FATHMM_converted_rankscore AUC = 0.82305 while using 98.491% of the values.
fathmm-MKL_coding_score AUC = 0.77145 while using 99.147% of the values.
fathmm-XF_coding_score AUC = 0.84277 while using 95.997% of the values.
SIFT_converted_rankscore AUC = 0

Unnamed: 0,Model,AUC,Percent_Retained
0,MutScore,0.968828,99.146982
8,MetaRNN_score,0.968321,99.146982
26,ClinPred_score,0.967731,99.081365
24,BayesDel_addAF_score,0.962108,99.081365
2,REVEL_score,0.942611,99.146982



ROC AD ALL DF shape: (15235, 4)


Unnamed: 0,fpr,tpr,threshold,model
0,0.0,0.0,2.0,MutScore
1,0.0,0.040172,1.0,MutScore
2,0.0,0.077475,0.999,MutScore
3,0.0,0.13056,0.998,MutScore
4,0.0,0.177905,0.997,MutScore


In [35]:
# Exporting to graph in R
#auc_ad_all_df.to_excel(cwd+"/AUC_ROC_results/auc/AUC_AD_ALL_performance.xlsx", index=False)
#roc_ad_all_df.to_excel(cwd+"/AUC_ROC_results/roc/ROC_AD_ALL_performance.xlsx", index=False)

### ONLY Autosomal Dominant (GOF + Haploinsuffiency with NO AR Overlap) IRD SNVs Performance

In [36]:
# List of models to calculate ROC & AUC values
models = ["MutScore", "VEST4_score", "REVEL_score", "PROVEAN_converted_rankscore", "MutPred_score", "MVP_score", "CADD_raw", 
          "MetaSVM_score", "MetaRNN_score", "MetaLR_score", "M-CAP_score", "DEOGEN2_score", "FATHMM_converted_rankscore", 
          "fathmm-MKL_coding_score", "fathmm-XF_coding_score", "SIFT_converted_rankscore", "SIFT4G_converted_rankscore", 
          "Polyphen2_HDIV_score", "Polyphen2_HVAR_score", "LRT_converted_rankscore", "MutationTaster_converted_rankscore", 
          "MutationAssessor_score", "MPC_score", "PrimateAI_score", "BayesDel_addAF_score", "BayesDel_noAF_score", 
          "ClinPred_score", "LIST-S2_score", "DANN_score", "Eigen-raw_coding", "Eigen-PC-raw_coding", "GenoCanyon_rankscore", 
          "integrated_fitCons_score", "GERP++_RS", "phyloP100way_vertebrate", "phyloP30way_mammalian", "phastCons100way_vertebrate", 
          "phastCons30way_mammalian", "SiPhy_29way_logOdds"]

# Looping and creating lists/dfs to append results to
auc_ad_only_data = []
roc_ad_only_df = pd.DataFrame(columns=["fpr", "tpr", "threshold", "model"])
for m in models:
    # Calulating parametrics for each prediction model
    fpr, tpr, threshold, auc_score, percent_retained = roc_auc(df=ad_only, model=m, truth="y_test", disp_results=True)
    
    # Appending a list with a tuple containing model name, its AUC score, and its percent SNV retained
    auc_ad_only_data.append((m, auc_score, percent_retained))
    
    # Making a dataframe with fpr, tpr, thresholds, and model ID for drawing ROC curves
    model_df = pd.DataFrame({"fpr": fpr, "tpr": tpr, "threshold": threshold, "model": np.full(len(fpr), m)})
    roc_ad_only_df = pd.concat([roc_ad_only_df, model_df], axis=0)

# Making the AUC dataframe from list of tuples
auc_ad_only_df = pd.DataFrame(auc_ad_only_data, columns=["Model", "AUC", "Percent_Retained"]).sort_values("AUC", ascending=False)

# Displaying results
print("\nAUC AD ONLY DF shape: {}".format(auc_ad_only_df.shape))
display(auc_ad_only_df.head())
print("\nROC AD ONLY DF shape: {}".format(roc_ad_only_df.shape))
display(roc_ad_only_df.head())

MutScore AUC = 0.9713 while using 99.208% of the values.
VEST4_score AUC = 0.94309 while using 99.208% of the values.
REVEL_score AUC = 0.9618 while using 99.208% of the values.
PROVEAN_converted_rankscore AUC = 0.88406 while using 99.208% of the values.
MutPred_score AUC = 0.94521 while using 76.244% of the values.
MVP_score AUC = 0.94857 while using 98.303% of the values.
CADD_raw AUC = 0.87664 while using 99.208% of the values.
MetaSVM_score AUC = 0.91595 while using 99.208% of the values.
MetaRNN_score AUC = 0.96994 while using 99.208% of the values.
MetaLR_score AUC = 0.93082 while using 99.208% of the values.
M-CAP_score AUC = 0.9473 while using 96.38% of the values.
DEOGEN2_score AUC = 0.92846 while using 94.457% of the values.
FATHMM_converted_rankscore AUC = 0.88163 while using 99.208% of the values.
fathmm-MKL_coding_score AUC = 0.78146 while using 99.208% of the values.
fathmm-XF_coding_score AUC = 0.84714 while using 99.208% of the values.
SIFT_converted_rankscore AUC = 0.9

Unnamed: 0,Model,AUC,Percent_Retained
24,BayesDel_addAF_score,0.974398,99.208145
26,ClinPred_score,0.97215,99.208145
0,MutScore,0.971298,99.208145
8,MetaRNN_score,0.969937,99.208145
2,REVEL_score,0.961801,99.208145



ROC AD ONLY DF shape: (8840, 4)


Unnamed: 0,fpr,tpr,threshold,model
0,0.0,0.0,2.0,MutScore
1,0.0,0.039326,1.0,MutScore
2,0.0,0.078652,0.999,MutScore
3,0.0,0.143258,0.998,MutScore
4,0.0,0.205056,0.997,MutScore


In [38]:
# Exporting to graph in R
#auc_ad_only_df.to_excel(cwd+"/AUC_ROC_results/auc/AUC_AD_ONLY_performance.xlsx", index=False)
#roc_ad_only_df.to_excel(cwd+"/AUC_ROC_results/roc/ROC_AD_ONLY_performance.xlsx", index=False)

### ALL GOF Autosomal Dominant (with AR Overlap) IRD SNVs Performance

In [39]:
# List of models to calculate ROC & AUC values
models = ["MutScore", "VEST4_score", "REVEL_score", "PROVEAN_converted_rankscore", "MutPred_score", "MVP_score", "CADD_raw", 
          "MetaSVM_score", "MetaRNN_score", "MetaLR_score", "M-CAP_score", "DEOGEN2_score", "FATHMM_converted_rankscore", 
          "fathmm-MKL_coding_score", "fathmm-XF_coding_score", "SIFT_converted_rankscore", "SIFT4G_converted_rankscore", 
          "Polyphen2_HDIV_score", "Polyphen2_HVAR_score", "LRT_converted_rankscore", "MutationTaster_converted_rankscore", 
          "MutationAssessor_score", "MPC_score", "PrimateAI_score", "BayesDel_addAF_score", "BayesDel_noAF_score", 
          "ClinPred_score", "LIST-S2_score", "DANN_score", "Eigen-raw_coding", "Eigen-PC-raw_coding", "GenoCanyon_rankscore", 
          "integrated_fitCons_score", "GERP++_RS", "phyloP100way_vertebrate", "phyloP30way_mammalian", "phastCons100way_vertebrate", 
          "phastCons30way_mammalian", "SiPhy_29way_logOdds"]

# Looping and creating lists/dfs to append results to
auc_ad_gof_all_data = []
roc_ad_gof_all_df = pd.DataFrame(columns=["fpr", "tpr", "threshold", "model"])
for m in models:
    # Calulating parametrics for each prediction model
    fpr, tpr, threshold, auc_score, percent_retained = roc_auc(df=ad_gof_all, model=m, truth="y_test", disp_results=True)
    
    # Appending a list with a tuple containing model name, its AUC score, and its percent SNV retained
    auc_ad_gof_all_data.append((m, auc_score, percent_retained))
    
    # Making a dataframe with fpr, tpr, thresholds, and model ID for drawing ROC curves
    model_df = pd.DataFrame({"fpr": fpr, "tpr": tpr, "threshold": threshold, "model": np.full(len(fpr), m)})
    roc_ad_gof_all_df = pd.concat([roc_ad_gof_all_df, model_df], axis=0)

# Making the AUC dataframe from list of tuples
auc_ad_gof_all_df = pd.DataFrame(auc_ad_gof_all_data, columns=["Model", "AUC", "Percent_Retained"]).sort_values("AUC", ascending=False)

# Displaying results
print("\nAUC AD GOF ALL DF shape: {}".format(auc_ad_gof_all_df.shape))
display(auc_ad_gof_all_df.head())
print("\nROC AD GOF ALL DF shape: {}".format(roc_ad_gof_all_df.shape))
display(roc_ad_gof_all_df.head())

MutScore AUC = 0.97222 while using 99.08% of the values.
VEST4_score AUC = 0.92351 while using 98.949% of the values.
REVEL_score AUC = 0.94895 while using 99.08% of the values.
PROVEAN_converted_rankscore AUC = 0.89337 while using 97.503% of the values.
MutPred_score AUC = 0.88684 while using 68.2% of the values.
MVP_score AUC = 0.91504 while using 96.583% of the values.
CADD_raw AUC = 0.83245 while using 99.08% of the values.
MetaSVM_score AUC = 0.91251 while using 99.08% of the values.
MetaRNN_score AUC = 0.9854 while using 99.08% of the values.
MetaLR_score AUC = 0.90866 while using 99.08% of the values.
M-CAP_score AUC = 0.90462 while using 91.459% of the values.
DEOGEN2_score AUC = 0.9278 while using 86.202% of the values.
FATHMM_converted_rankscore AUC = 0.82988 while using 97.766% of the values.
fathmm-MKL_coding_score AUC = 0.79884 while using 99.08% of the values.
fathmm-XF_coding_score AUC = 0.86836 while using 96.189% of the values.
SIFT_converted_rankscore AUC = 0.87888 wh

Unnamed: 0,Model,AUC,Percent_Retained
8,MetaRNN_score,0.985403,99.080158
24,BayesDel_addAF_score,0.98395,98.948752
26,ClinPred_score,0.982704,98.948752
0,MutScore,0.97222,99.080158
2,REVEL_score,0.948948,99.080158



ROC AD GOF ALL DF shape: (8672, 4)


Unnamed: 0,fpr,tpr,threshold,model
0,0.0,0.0,2.0,MutScore
1,0.0,0.040541,1.0,MutScore
2,0.0,0.072973,0.999,MutScore
3,0.0,0.113514,0.998,MutScore
4,0.0,0.143243,0.997,MutScore


In [41]:
# Exporting to graph in R
#auc_ad_gof_all_df.to_excel(cwd+"/AUC_ROC_results/auc/AUC_AD_GOF_ALL_performance.xlsx", index=False)
#roc_ad_gof_all_df.to_excel(cwd+"/AUC_ROC_results/roc/ROC_AD_GOF_ALL_performance.xlsx", index=False)

### ONLY GOF Autosomal Dominant (with NO AR Overlap) IRD SNVs Performance

In [42]:
# List of models to calculate ROC & AUC values
models = ["MutScore", "VEST4_score", "REVEL_score", "PROVEAN_converted_rankscore", "MutPred_score", "MVP_score", "CADD_raw", 
          "MetaSVM_score", "MetaRNN_score", "MetaLR_score", "M-CAP_score", "DEOGEN2_score", "FATHMM_converted_rankscore", 
          "fathmm-MKL_coding_score", "fathmm-XF_coding_score", "SIFT_converted_rankscore", "SIFT4G_converted_rankscore", 
          "Polyphen2_HDIV_score", "Polyphen2_HVAR_score", "LRT_converted_rankscore", "MutationTaster_converted_rankscore", 
          "MutationAssessor_score", "MPC_score", "PrimateAI_score", "BayesDel_addAF_score", "BayesDel_noAF_score", 
          "ClinPred_score", "LIST-S2_score", "DANN_score", "Eigen-raw_coding", "Eigen-PC-raw_coding", "GenoCanyon_rankscore", 
          "integrated_fitCons_score", "GERP++_RS", "phyloP100way_vertebrate", "phyloP30way_mammalian", "phastCons100way_vertebrate", 
          "phastCons30way_mammalian", "SiPhy_29way_logOdds"]

# Looping and creating lists/dfs to append results to
auc_ad_gof_only_data = []
roc_ad_gof_only_df = pd.DataFrame(columns=["fpr", "tpr", "threshold", "model"])
for m in models:
    # Calulating parametrics for each prediction model
    fpr, tpr, threshold, auc_score, percent_retained = roc_auc(df=ad_gof_only, model=m, truth="y_test", disp_results=True)
    
    # Appending a list with a tuple containing model name, its AUC score, and its percent SNV retained
    auc_ad_gof_only_data.append((m, auc_score, percent_retained))
    
    # Making a dataframe with fpr, tpr, thresholds, and model ID for drawing ROC curves
    model_df = pd.DataFrame({"fpr": fpr, "tpr": tpr, "threshold": threshold, "model": np.full(len(fpr), m)})
    roc_ad_gof_only_df = pd.concat([roc_ad_gof_only_df, model_df], axis=0)

# Making the AUC dataframe from list of tuples
auc_ad_gof_only_df = pd.DataFrame(auc_ad_gof_only_data, columns=["Model", "AUC", "Percent_Retained"]).sort_values("AUC", ascending=False)

# Displaying results
print("\nAUC AD GOF ONLY DF shape: {}".format(auc_ad_gof_only_df.shape))
display(auc_ad_gof_only_df.head())
print("\nROC AD GOF ONLY DF shape: {}".format(roc_ad_gof_only_df.shape))
display(roc_ad_gof_only_df.head())

MutScore AUC = 0.97234 while using 99.359% of the values.
VEST4_score AUC = 0.94293 while using 99.359% of the values.
REVEL_score AUC = 0.96394 while using 99.359% of the values.
PROVEAN_converted_rankscore AUC = 0.87255 while using 99.359% of the values.
MutPred_score AUC = 0.88312 while using 43.59% of the values.
MVP_score AUC = 0.92005 while using 94.231% of the values.
CADD_raw AUC = 0.8436 while using 99.359% of the values.
MetaSVM_score AUC = 0.93184 while using 99.359% of the values.
MetaRNN_score AUC = 0.99323 while using 99.359% of the values.
MetaLR_score AUC = 0.93651 while using 99.359% of the values.
M-CAP_score AUC = 0.91199 while using 83.974% of the values.
DEOGEN2_score AUC = 0.93465 while using 73.077% of the values.
FATHMM_converted_rankscore AUC = 0.90966 while using 99.359% of the values.
fathmm-MKL_coding_score AUC = 0.82026 while using 99.359% of the values.
fathmm-XF_coding_score AUC = 0.89379 while using 99.359% of the values.
SIFT_converted_rankscore AUC = 0

Unnamed: 0,Model,AUC,Percent_Retained
24,BayesDel_addAF_score,0.997316,99.358974
8,MetaRNN_score,0.993231,99.358974
26,ClinPred_score,0.991246,99.358974
0,MutScore,0.972339,99.358974
25,BayesDel_noAF_score,0.964286,99.358974



ROC AD GOF ONLY DF shape: (1564, 4)


Unnamed: 0,fpr,tpr,threshold,model
0,0.0,0.0,2.0,MutScore
1,0.0,0.027778,1.0,MutScore
2,0.0,0.194444,0.993,MutScore
3,0.0,0.25,0.991,MutScore
4,0.0,0.333333,0.973,MutScore


In [44]:
# Exporting to graph in R
#auc_ad_gof_only_df.to_excel(cwd+"/AUC_ROC_results/auc/AUC_AD_GOF_ONLY_performance.xlsx", index=False)
#roc_ad_gof_only_df.to_excel(cwd+"/AUC_ROC_results/roc/ROC_AD_GOF_ONLY_performance.xlsx", index=False)

### ALL Haploinsuffiency Autosomal Dominant (with AR Overlap) IRD SNVs Performance

In [13]:
# List of models to calculate ROC & AUC values
models = ["MutScore", "VEST4_score", "REVEL_score", "PROVEAN_converted_rankscore", "MutPred_score", "MVP_score", "CADD_raw", 
          "MetaSVM_score", "MetaRNN_score", "MetaLR_score", "M-CAP_score", "DEOGEN2_score", "FATHMM_converted_rankscore", 
          "fathmm-MKL_coding_score", "fathmm-XF_coding_score", "SIFT_converted_rankscore", "SIFT4G_converted_rankscore", 
          "Polyphen2_HDIV_score", "Polyphen2_HVAR_score", "LRT_converted_rankscore", "MutationTaster_converted_rankscore", 
          "MutationAssessor_score", "MPC_score", "PrimateAI_score", "BayesDel_addAF_score", "BayesDel_noAF_score", 
          "ClinPred_score", "LIST-S2_score", "DANN_score", "Eigen-raw_coding", "Eigen-PC-raw_coding", "GenoCanyon_rankscore", 
          "integrated_fitCons_score", "GERP++_RS", "phyloP100way_vertebrate", "phyloP30way_mammalian", "phastCons100way_vertebrate", 
          "phastCons30way_mammalian", "SiPhy_29way_logOdds"]

# Looping and creating lists/dfs to append results to
auc_ad_haplo_all_data = []
roc_ad_haplo_all_df = pd.DataFrame(columns=["fpr", "tpr", "threshold", "model"])
for m in models:
    # Calulating parametrics for each prediction model
    fpr, tpr, threshold, auc_score, percent_retained = roc_auc(df=ad_haplo_all, model=m, truth="y_test", disp_results=True)
    
    # Appending a list with a tuple containing model name, its AUC score, and its percent SNV retained
    auc_ad_haplo_all_data.append((m, auc_score, percent_retained))
    
    # Making a dataframe with fpr, tpr, thresholds, and model ID for drawing ROC curves
    model_df = pd.DataFrame({"fpr": fpr, "tpr": tpr, "threshold": threshold, "model": np.full(len(fpr), m)})
    roc_ad_haplo_all_df = pd.concat([roc_ad_haplo_all_df, model_df], axis=0)

# Making the AUC dataframe from list of tuples
auc_ad_haplo_all_df = pd.DataFrame(auc_ad_haplo_all_data, columns=["Model", "AUC", "Percent_Retained"]).sort_values("AUC", ascending=False)

# Displaying results
print("\nAUC AD HAPLO ALL DF shape: {}".format(auc_ad_haplo_all_df.shape))
display(auc_ad_haplo_all_df.head())
print("\nROC AD HAPLO ALL DF shape: {}".format(roc_ad_haplo_all_df.shape))
display(roc_ad_haplo_all_df.head())

MutScore AUC = 0.97252 while using 99.214% of the values.
VEST4_score AUC = 0.94246 while using 99.214% of the values.
REVEL_score AUC = 0.96242 while using 99.214% of the values.
PROVEAN_converted_rankscore AUC = 0.88922 while using 99.214% of the values.
MutPred_score AUC = 0.95069 while using 83.355% of the values.
MVP_score AUC = 0.95611 while using 99.083% of the values.
CADD_raw AUC = 0.88468 while using 99.214% of the values.
MetaSVM_score AUC = 0.91798 while using 99.214% of the values.
MetaRNN_score AUC = 0.9653 while using 99.214% of the values.
MetaLR_score AUC = 0.9354 while using 99.214% of the values.
M-CAP_score AUC = 0.9546 while using 98.689% of the values.
DEOGEN2_score AUC = 0.92934 while using 98.034% of the values.
FATHMM_converted_rankscore AUC = 0.88797 while using 99.214% of the values.
fathmm-MKL_coding_score AUC = 0.77838 while using 99.214% of the values.
fathmm-XF_coding_score AUC = 0.83578 while using 95.806% of the values.
SIFT_converted_rankscore AUC = 0.

Unnamed: 0,Model,AUC,Percent_Retained
0,MutScore,0.97252,99.21363
24,BayesDel_addAF_score,0.971709,99.21363
26,ClinPred_score,0.970173,99.21363
8,MetaRNN_score,0.965305,99.21363
2,REVEL_score,0.962424,99.21363



ROC AD HAPLO ALL DF shape: (7476, 4)


Unnamed: 0,fpr,tpr,threshold,model
0,0.0,0.0,2.0,MutScore
1,0.0,0.039755,1.0,MutScore
2,0.0,0.082569,0.999,MutScore
3,0.0,0.217125,0.997,MutScore
4,0.002326,0.269113,0.996,MutScore


In [47]:
# Exporting to graph in R
#auc_ad_haplo_all_df.to_excel(cwd+"/AUC_ROC_results/auc/AUC_AD_HAPLO_ALL_performance.xlsx", index=False)
#roc_ad_haplo_all_df.to_excel(cwd+"/AUC_ROC_results/roc/ROC_AD_HAPLO_ALL_performance.xlsx", index=False)

### ONLY Haploinsuffiency Autosomal Dominant (with NO AR Overlap) IRD SNVs Performance

In [48]:
# List of models to calculate ROC & AUC values
models = ["MutScore", "VEST4_score", "REVEL_score", "PROVEAN_converted_rankscore", "MutPred_score", "MVP_score", "CADD_raw", 
          "MetaSVM_score", "MetaRNN_score", "MetaLR_score", "M-CAP_score", "DEOGEN2_score", "FATHMM_converted_rankscore", 
          "fathmm-MKL_coding_score", "fathmm-XF_coding_score", "SIFT_converted_rankscore", "SIFT4G_converted_rankscore", 
          "Polyphen2_HDIV_score", "Polyphen2_HVAR_score", "LRT_converted_rankscore", "MutationTaster_converted_rankscore", 
          "MutationAssessor_score", "MPC_score", "PrimateAI_score", "BayesDel_addAF_score", "BayesDel_noAF_score", 
          "ClinPred_score", "LIST-S2_score", "DANN_score", "Eigen-raw_coding", "Eigen-PC-raw_coding", "GenoCanyon_rankscore", 
          "integrated_fitCons_score", "GERP++_RS", "phyloP100way_vertebrate", "phyloP30way_mammalian", "phastCons100way_vertebrate", 
          "phastCons30way_mammalian", "SiPhy_29way_logOdds"]

# Looping and creating lists/dfs to append results to
auc_ad_haplo_only_data = []
roc_ad_haplo_only_df = pd.DataFrame(columns=["fpr", "tpr", "threshold", "model"])
for m in models:
    # Calulating parametrics for each prediction model
    fpr, tpr, threshold, auc_score, percent_retained = roc_auc(df=ad_haplo_only, model=m, truth="y_test", disp_results=True)
    
    # Appending a list with a tuple containing model name, its AUC score, and its percent SNV retained
    auc_ad_haplo_only_data.append((m, auc_score, percent_retained))
    
    # Making a dataframe with fpr, tpr, thresholds, and model ID for drawing ROC curves
    model_df = pd.DataFrame({"fpr": fpr, "tpr": tpr, "threshold": threshold, "model": np.full(len(fpr), m)})
    roc_ad_haplo_only_df = pd.concat([roc_ad_haplo_only_df, model_df], axis=0)

# Making the AUC dataframe from list of tuples
auc_ad_haplo_only_df = pd.DataFrame(auc_ad_haplo_only_data, columns=["Model", "AUC", "Percent_Retained"]).sort_values("AUC", ascending=False)

# Displaying results
print("\nAUC AD HAPLO ONLY DF shape: {}".format(auc_ad_haplo_only_df.shape))
display(auc_ad_haplo_only_df.head())
print("\nROC AD HAPLO ONLY DF shape: {}".format(roc_ad_haplo_only_df.shape))
display(roc_ad_haplo_only_df.head())

MutScore AUC = 0.9707 while using 99.176% of the values.
VEST4_score AUC = 0.94096 while using 99.176% of the values.
REVEL_score AUC = 0.96054 while using 99.176% of the values.
PROVEAN_converted_rankscore AUC = 0.88238 while using 99.176% of the values.
MutPred_score AUC = 0.94899 while using 83.242% of the values.
MVP_score AUC = 0.95349 while using 99.176% of the values.
CADD_raw AUC = 0.87839 while using 99.176% of the values.
MetaSVM_score AUC = 0.91298 while using 99.176% of the values.
MetaRNN_score AUC = 0.96393 while using 99.176% of the values.
MetaLR_score AUC = 0.93169 while using 99.176% of the values.
M-CAP_score AUC = 0.95481 while using 99.038% of the values.
DEOGEN2_score AUC = 0.92972 while using 99.038% of the values.
FATHMM_converted_rankscore AUC = 0.88308 while using 99.176% of the values.
fathmm-MKL_coding_score AUC = 0.76377 while using 99.176% of the values.
fathmm-XF_coding_score AUC = 0.83276 while using 99.176% of the values.
SIFT_converted_rankscore AUC = 

Unnamed: 0,Model,AUC,Percent_Retained
24,BayesDel_addAF_score,0.971774,99.175824
0,MutScore,0.970697,99.175824
26,ClinPred_score,0.967949,99.175824
8,MetaRNN_score,0.963926,99.175824
2,REVEL_score,0.960537,99.175824



ROC AD HAPLO ONLY DF shape: (7187, 4)


Unnamed: 0,fpr,tpr,threshold,model
0,0.0,0.0,2.0,MutScore
1,0.0,0.040625,1.0,MutScore
2,0.0,0.084375,0.999,MutScore
3,0.0,0.153125,0.998,MutScore
4,0.0,0.21875,0.997,MutScore


In [50]:
# Exporting to graph in R
#auc_ad_haplo_only_df.to_excel(cwd+"/AUC_ROC_results/auc/AUC_AD_HAPLO_ONLY_performance.xlsx", index=False)
#roc_ad_haplo_only_df.to_excel(cwd+"/AUC_ROC_results/roc/ROC_AD_HAPLO_ONLY_performance.xlsx", index=False)