In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score, precision_score, recall_score, f1_score, confusion_matrix

In [3]:
# Validation split with the DBN predictions attached. The train / test / Prov
# prediction files are listed in `filenames` in the evaluation cell further down.
valid_predictions_path = "../Data/genie_datasets/DBN_predictions/all_var_BN_model/cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_UCLA_valid_with_DBN_predictions.csv"

df_valid = pd.read_csv(valid_predictions_path)

## Estimate metrics over varying thresholds

In [5]:
# Sweep a grid of decision thresholds over the validation predictions and record,
# per epoch (prediction year), the confusion-matrix counts and the derived rates.
#
# Result layout (consumed by the plotting cells):
#   epoch_stats["epoch_<k>"] == [counts_dict, rates_dict]
#   counts_dict keys: "TNs", "FPs", "FNs", "TPs"
#   rates_dict keys:  "Precision/PPV", "Recall/Sensitivity", "Specificity", "F1 score"
# Every value is a list with one entry per element of `thresholds`.
thresholds = np.linspace(0,1,100)
num_of_epochs = 1

epoch_stats = dict()


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is 0.

    0.0 is the value scikit-learn's precision/recall/F1 report by default for an
    undefined ratio, so the sweep stays numerically in line with those functions
    while avoiding their warning at thresholds where nothing is predicted positive.
    """
    return float(numerator) / float(denominator) if denominator else 0.0


# looping over epochs
for epoch_num in tqdm(range(0, num_of_epochs)):

    year = epoch_num + 1

    # target variable: labels are stored as strings with an "S_" prefix
    target = "year" + str(year) + "_reduction_40_ge"
    truth = df_valid[target].str.replace("S_", "").astype(int).values
    predictions = df_valid["predictions_year" + str(year)]

    # The counts below are only meaningful for a binary target.
    assert np.isin(truth, (0, 1)).all(), "target must decode to 0/1 labels"

    tns, fps, fns, tps = [], [], [], []
    precisions, recalls, specificities, f1_scores = [], [], [], []

    # looping over thresholds
    for threshold in thresholds:
        preds = (predictions > threshold) * 1

        # labels=[0, 1] pins the matrix to 2x2 so the four-way unpack cannot
        # fail when one class is absent from both truth and preds.
        tn, fp, fn, tp = confusion_matrix(truth, preds, labels=[0, 1]).ravel()

        # confusion matrices
        tns.append(tn)
        fps.append(fp)
        fns.append(fn)
        tps.append(tp)

        # metrics, derived from the counts above instead of three further
        # passes over the data per threshold
        precisions.append(_safe_ratio(tp, tp + fp))
        recalls.append(_safe_ratio(tp, tp + fn))
        specificities.append(_safe_ratio(tn, tn + fp))
        f1_scores.append(_safe_ratio(2 * tp, 2 * tp + fp + fn))

    # Insertion order of these dicts is the trace order in the plots.
    epoch_cm_metrics = {
        "TNs": tns,
        "FPs": fps,
        "FNs": fns,
        "TPs": tps,
    }
    epoch_metrics = {
        "Precision/PPV": precisions,
        "Recall/Sensitivity": recalls,
        "Specificity": specificities,
        "F1 score": f1_scores,
    }

    # index 0 -> counts, index 1 -> rates
    epoch_stats["epoch_" + str(year)] = [epoch_cm_metrics, epoch_metrics]

100%|██████████| 1/1 [00:04<00:00,  4.17s/it]


## Plot metrics

In [11]:
import plotly.graph_objects as go


# x axis: thresholds rounded to two decimals, computed once for all traces
rounded_thresholds = [round(threshold, 2) for threshold in thresholds]

# One figure per epoch stored in `epoch_stats` (keys "epoch_1" .. "epoch_N"),
# rather than a hard-coded range(1, 2) that would ignore additional epochs.
for epoch_num in range(1, len(epoch_stats) + 1):

    # index 1 of each epoch entry holds the rate metrics (precision, recall, ...)
    rate_metrics = epoch_stats["epoch_" + str(epoch_num)][1]

    # Create traces
    fig = go.Figure()

    for name, values in rate_metrics.items():
        fig.add_trace(go.Scatter(x=rounded_thresholds,
                                 y=[round(val, 3) for val in values],
                                 mode='lines+markers',
                                 name=name))

    # Edit the layout
    fig.update_layout(title='Metrics Vs Threshold epoch ' + str(epoch_num),
                      xaxis_title='Thresholds',
                      yaxis_title='Metric score [0,1]')

    fig.show()

## Plot confusion-matrix (CM) counts

In [13]:
import plotly.graph_objects as go


# x axis: thresholds rounded to two decimals, computed once for all traces
rounded_thresholds = [round(threshold, 2) for threshold in thresholds]

# One figure per epoch stored in `epoch_stats` (keys "epoch_1" .. "epoch_N"),
# rather than a hard-coded range(1, 2) that would ignore additional epochs.
for epoch_num in range(1, len(epoch_stats) + 1):

    # index 0 of each epoch entry holds the confusion-matrix counts (TN/FP/FN/TP)
    cm_counts = epoch_stats["epoch_" + str(epoch_num)][0]

    # Create traces
    fig = go.Figure()

    for name, counts in cm_counts.items():
        fig.add_trace(go.Scatter(x=rounded_thresholds,
                                 y=counts,
                                 mode='lines+markers',
                                 name=name))

    # Edit the layout
    fig.update_layout(title='CM Metrics Vs Threshold epoch ' + str(epoch_num),
                      xaxis_title='Thresholds',
                      yaxis_title='Metric count')

    fig.show()

# Evaluation after choosing threshold

In [23]:
# Score every prediction file at one fixed decision threshold and collect the
# metrics into `df_all_results`: one row per (dataset, metric), one "Year <k>"
# column per epoch.
filenames = [
        "../Data/genie_datasets/DBN_predictions/all_var_BN_model/cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_UCLA_train_with_DBN_predictions.csv",
        "../Data/genie_datasets/DBN_predictions/all_var_BN_model/cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_UCLA_valid_with_DBN_predictions.csv",
        "../Data/genie_datasets/DBN_predictions/all_var_BN_model/cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_UCLA_test_with_DBN_predictions.csv",
        "../Data/genie_datasets/DBN_predictions/all_var_BN_model/Prov_cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_using_UCLA_discritizer_with_DBN_predictions.csv"
]

# Path prefix stripped from each file name to form the "Dataset" label.
# NOTE(review): the "Prov_..." path does not contain this prefix (its file name
# starts with "Prov_"), so its label stays the full path -- confirm whether that
# is intended.
dataset_label_prefix = "../Data/genie_datasets/DBN_predictions/all_var_BN_model/cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_"

# Decision threshold applied to the predicted probabilities of every dataset.
chosen_threshold = 0.14

num_of_epochs = 1

cols = ["Dataset","Metric"]+["Year " + str(epoch_num + 1) for epoch_num in range(num_of_epochs)]

# Row order of the per-dataset results table.
metric_names = [
    "AUC ROC", "AP",
    "TNs", "FPs", "FNs", "TPs",
    "Precision/PPV", "Recall/Sensitivity", "Specificity", "F1 score",
]

# One results frame per file; concatenated once after the loop instead of
# growing a DataFrame with DataFrame.append (removed in pandas 2.0).
per_file_results = []

for filename in filenames:

    # Deliberately not named `df_valid`: the validation frame loaded at the top
    # of the notebook is read by the threshold-sweep cell and must stay intact.
    df_split = pd.read_csv(filename, low_memory=False)

    print(filename.split('/')[-1])

    # Each row starts as [dataset label, metric name]; one value per epoch follows.
    dataset_label = filename.replace(dataset_label_prefix, "")
    rows = {metric: [dataset_label, metric] for metric in metric_names}

    # looping over epochs
    for epoch_num in range(0, num_of_epochs):

        # target variable: labels are stored as strings with an "S_" prefix
        target = "year" + str(1 + epoch_num) + "_reduction_40_ge"
        truth = df_split[target].str.replace("S_", "").astype(int).values
        predictions = df_split["predictions_year" + str(epoch_num + 1)]

        # threshold-free ranking metrics
        auc_roc = roc_auc_score(truth,predictions)
        ap_score = average_precision_score(truth,predictions)

        # thresholded predictions; labels=[0, 1] keeps the matrix 2x2 so the
        # four-way unpack is always valid
        preds = (predictions>chosen_threshold)*1
        tn, fp, fn, tp = confusion_matrix(truth, preds, labels=[0, 1]).ravel()

        precision = precision_score(truth,preds)
        recall = recall_score(truth,preds)
        specificity = float(tn) / float(tn+fp)
        f1Score = f1_score(truth, preds)

        rows["AUC ROC"].append(round(auc_roc,3))
        rows["AP"].append(round(ap_score,3))
        rows["TNs"].append(tn)
        rows["FPs"].append(fp)
        rows["FNs"].append(fn)
        rows["TPs"].append(tp)
        rows["Precision/PPV"].append(round(precision,3))
        rows["Recall/Sensitivity"].append(round(recall,3))
        rows["Specificity"].append(round(specificity,3))
        rows["F1 score"].append(round(f1Score,3))

        print("epoch " + str(epoch_num + 1))

        # aucs
        print("AUC ROC: ")
        print(auc_roc)
        print("AP: ")
        print(ap_score)

        # confusion matrices
        print("TN, FP, FN, TP :")
        print(tn, fp, fn, tp)

        # metrics
        print("Precision, Recall, Spec, F1 score :")
        print(precision,recall,specificity,f1Score)

    df_results = pd.DataFrame(data=list(rows.values()),columns=cols)
    per_file_results.append(df_results)

    print('\n')

# Same row order and per-file index as the former append chain; falls back to
# an empty frame when no files were processed.
df_all_results = pd.concat(per_file_results) if per_file_results else pd.DataFrame()

cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_UCLA_train_with_DBN_predictions.csv
epoch 1
AUC ROC: 
0.766575037709348
AP: 
0.04211919642714291
TN, FP, FN, TP :
96346 41776 343 876
Precision, Recall, Spec, F1 score :
0.020538310044077652 0.7186218211648893 0.6975427520597732 0.039935264753481796


cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_UCLA_valid_with_DBN_predictions.csv
epoch 1
AUC ROC: 
0.7373884354922731
AP: 
0.033472963560965296
TN, FP, FN, TP :
31971 14101 111 264
Precision, Recall, Spec, F1 score :
0.01837800208840933 0.704 0.6939355790935926 0.03582089552238806


cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_UCLA_test_with_DBN_predictions.csv
epoch 1
AUC ROC: 
0.7638874005511712
AP: 
0.03308346028000504
TN, FP, FN, TP :
31902 14138 116 291
Precision, Recall, Spec, F1 score :
0.020167717790560678 0.714987714987715 0.6929192006950478 0.039228902669183066


Prov_cure_ckd_egfr_registry_preprocessed_project_pr

In [20]:
# Persist the combined results table; the DataFrame index is not written.
results_csv_path = "../Data/genie_datasets/DBN_predictions/Results/all_vars_model_BN_results.csv"
df_all_results.to_csv(results_csv_path, index=False)

In [24]:
# Show the combined metrics table (one row per dataset and metric).
df_all_results

Unnamed: 0,Dataset,Metric,Year 1
0,UCLA_train_with_DBN_predictions.csv,AUC ROC,0.767
1,UCLA_train_with_DBN_predictions.csv,AP,0.042
2,UCLA_train_with_DBN_predictions.csv,TNs,96346.0
3,UCLA_train_with_DBN_predictions.csv,FPs,41776.0
4,UCLA_train_with_DBN_predictions.csv,FNs,343.0
5,UCLA_train_with_DBN_predictions.csv,TPs,876.0
6,UCLA_train_with_DBN_predictions.csv,Precision/PPV,0.021
7,UCLA_train_with_DBN_predictions.csv,Recall/Sensitivity,0.719
8,UCLA_train_with_DBN_predictions.csv,Specificity,0.698
9,UCLA_train_with_DBN_predictions.csv,F1 score,0.04
