In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Load the UCLA validation split with the DBN predictions attached.
# NOTE(review): this path sits under "all_var_dbn_model", whereas the evaluation
# cell further down reads its files from "all_var_model" -- confirm that the two
# directories are both intended.
valid_predictions_csv = "../Data/genie_datasets/DBN_predictions/all_var_dbn_model/cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_UCLA_valid_with_DBN_predictions.csv"

df_valid = pd.read_csv(valid_predictions_csv)

## Estimate metrics over varying threshold

In [None]:
# Sweep the decision threshold over `df_valid` and, for every prediction year,
# record the confusion-matrix counts and the derived metrics at each threshold.
thresholds = np.linspace(0, 1, 100)
num_of_epochs = 5

# epoch_stats["epoch_<k>"] == [confusion-matrix counts dict, metric scores dict].
# Every list stored in those dicts is aligned index-by-index with `thresholds`.
epoch_stats = dict()

# looping over epochs (one epoch per prediction year)
for epoch_num in tqdm(range(0, num_of_epochs)):
    year = epoch_num + 1

    # target variable: the "S_" marker is stripped and the remainder cast to int
    target = "year" + str(year) + "_reduction_40_ge"
    truth = df_valid[target].str.replace("S_", "").astype(int).values
    predictions = df_valid["predictions_year" + str(year)]

    # Key order is kept stable because the plotting cells draw one trace per key.
    epoch_cm_metrics = {"TNs": [], "FPs": [], "FNs": [], "TPs": []}
    epoch_metrics = {
        "Precision/PPV": [],
        "Recall/Sensitivity": [],
        "Specificity": [],
        "F1 score": [],
    }

    # looping over thresholds
    for threshold in thresholds:
        preds = (predictions > threshold) * 1

        # labels=[0, 1] forces a 2x2 matrix even when truth and preds together
        # contain a single class; otherwise ravel() yields fewer than four
        # values and the unpacking below fails.
        tn, fp, fn, tp = confusion_matrix(truth, preds, labels=[0, 1]).ravel()

        # confusion matrices
        epoch_cm_metrics["TNs"].append(tn)
        epoch_cm_metrics["FPs"].append(fp)
        epoch_cm_metrics["FNs"].append(fn)
        epoch_cm_metrics["TPs"].append(tp)

        # Specificity is undefined when there are no actual negatives; report
        # 0.0 in that case instead of raising ZeroDivisionError.
        negatives = tn + fp
        specificity = float(tn) / float(negatives) if negatives else 0.0

        # metrics
        epoch_metrics["Precision/PPV"].append(precision_score(truth, preds))
        epoch_metrics["Recall/Sensitivity"].append(recall_score(truth, preds))
        epoch_metrics["Specificity"].append(specificity)
        epoch_metrics["F1 score"].append(f1_score(truth, preds))

    epoch_stats["epoch_" + str(year)] = [epoch_cm_metrics, epoch_metrics]

## Plot metrics

In [None]:
import plotly.graph_objects as go

# One figure per epoch: every metric stored in the second element of
# epoch_stats["epoch_<k>"] is drawn as a curve over `thresholds`.
# The range is derived from epoch_stats itself so this cell keeps working when
# the sweep is run with a different number of epochs.
for epoch_num in range(1, len(epoch_stats) + 1):

    metric_curves = epoch_stats["epoch_" + str(epoch_num)][1]

    # Create traces
    fig = go.Figure()

    for name, scores in metric_curves.items():
        fig.add_trace(go.Scatter(x=thresholds,
                                 y=scores,
                                 mode='lines+markers',
                                 name=name))

    # Edit the layout
    fig.update_layout(title='Metrics Vs Threshold epoch ' + str(epoch_num),
                      xaxis_title='Thresholds',
                      yaxis_title='Metric score [0,1]')

    fig.show()

## Plot confusion-matrix (CM) counts

In [None]:
import plotly.graph_objects as go

# One figure per epoch: every confusion-matrix count stored in the first element
# of epoch_stats["epoch_<k>"] is drawn as a curve over `thresholds`.
# The range is derived from epoch_stats itself so this cell keeps working when
# the sweep is run with a different number of epochs.
for epoch_num in range(1, len(epoch_stats) + 1):

    count_curves = epoch_stats["epoch_" + str(epoch_num)][0]

    # Create traces
    fig = go.Figure()

    for name, counts in count_curves.items():
        fig.add_trace(go.Scatter(x=thresholds,
                                 y=counts,
                                 mode='lines+markers',
                                 name=name))

    # Edit the layout
    fig.update_layout(title='CM Metrics Vs Threshold epoch ' + str(epoch_num),
                      xaxis_title='Thresholds',
                      yaxis_title='Metric count')

    fig.show()

# Evaluation after choosing threshold

In [None]:
# Evaluate every prediction file at one fixed threshold and collect the metrics
# into `df_all_results`: one row per (dataset, metric) and one column per
# (prediction year, target year) pair with target year >= prediction year.
filenames = [
        "../Data/genie_datasets/DBN_predictions/all_var_model/cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_UCLA_train_with_DBN_predictions.csv",
        "../Data/genie_datasets/DBN_predictions/all_var_model/cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_UCLA_valid_with_DBN_predictions.csv",
        "../Data/genie_datasets/DBN_predictions/all_var_model/cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_UCLA_test_with_DBN_predictions.csv",
        "../Data/genie_datasets/DBN_predictions/all_var_model/Prov_cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_using_UCLA_discritizer_with_DBN_predictions.csv"
]

num_of_epochs = 5

# Threshold under test (the original note calls it the optimal threshold).
# NOTE(review): presumably read off the validation sweep plots -- confirm.
threshold = 0.14

# Column layout does not depend on the dataset, so it is built once.
cols = ["Dataset", "Metric"] + [
    "Prediction Year " + str(epoch_num + 1) + ", target year " + str(epoch_num_targ + 1)
    for epoch_num in range(num_of_epochs)
    for epoch_num_targ in range(epoch_num, num_of_epochs)
]

# Row order of the per-dataset result table.
metric_names = [
    "AUC ROC", "AP",
    "TNs", "FPs", "FNs", "TPs",
    "Precision/PPV", "Recall/Sensitivity",
    "Specificity", "F1 score",
]

# Per-dataset frames are gathered here and concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and copied the frame every call).
per_dataset_results = []

for filename in tqdm(filenames):

    # Loaded under its own name so the validation frame `df_valid` used by the
    # threshold-sweep cell is not silently replaced by another dataset.
    df_eval = pd.read_csv(filename, low_memory=False)

    # Short dataset label: drop the directory and, where present, the common
    # file-name prefix.
    dataset_label = filename.replace("../Data/genie_datasets/DBN_predictions/all_var_model/cure_ckd_egfr_registry_preprocessed_project_preproc_data_discretized_", "") \
                            .replace("../Data/genie_datasets/DBN_predictions/all_var_model/", "")

    print(dataset_label.split('/')[-1])

    # Each row starts with its [Dataset, Metric] identifiers; one value is
    # appended per (prediction year, target year) pair, in `cols` order.
    rows = {metric_name: [dataset_label, metric_name] for metric_name in metric_names}

    # looping over prediction years, then over target years >= prediction year
    for epoch_num in tqdm(range(0, num_of_epochs)):
        for epoch_num_targ in range(epoch_num, num_of_epochs):
            print(dataset_label)
            print("Prediction Year " + str(epoch_num + 1) + ", target year " + str(epoch_num_targ + 1))

            # target variable: the "S_" marker is stripped and the rest cast to int
            target = "year" + str(1 + epoch_num_targ) + "_reduction_40_ge"
            truth = df_eval[target].str.replace("S_", "").astype(int).values
            predictions = df_eval["predictions_year" + str(epoch_num + 1)]

            # threshold-free ranking metrics
            auc_roc = roc_auc_score(truth, predictions)
            ap_score = average_precision_score(truth, predictions)
            rows["AUC ROC"].append(auc_roc)
            rows["AP"].append(ap_score)

            # labels=[0, 1] forces a 2x2 matrix even when truth and preds
            # together contain a single class, so the unpacking cannot fail.
            preds = (predictions > threshold) * 1
            tn, fp, fn, tp = confusion_matrix(truth, preds, labels=[0, 1]).ravel()

            rows["TNs"].append(tn)
            rows["FPs"].append(fp)
            rows["FNs"].append(fn)
            rows["TPs"].append(tp)

            # aucs
            print("AUC ROC: ")
            print(auc_roc)
            print("AP: ")
            print(ap_score)

            # confusion matrices
            print("TN, FP, FN, TP :")
            print(tn, fp, fn, tp)

            precision = precision_score(truth, preds)
            recall = recall_score(truth, preds)
            # Specificity is undefined without actual negatives; report 0.0
            # instead of raising ZeroDivisionError.
            negatives = tn + fp
            specificity = float(tn) / float(negatives) if negatives else 0.0
            f1Score = f1_score(truth, preds)

            rows["Precision/PPV"].append(precision)
            rows["Recall/Sensitivity"].append(recall)
            rows["Specificity"].append(specificity)
            rows["F1 score"].append(f1Score)

            # metrics
            print("Precision, Recall, Spec, F1 score :")
            print(precision, recall, specificity, f1Score)

    df_results = pd.DataFrame(data=list(rows.values()), columns=cols)
    per_dataset_results.append(df_results)

    print('\n')

# pd.concat raises on an empty list, so fall back to an empty frame.
df_all_results = pd.concat(per_dataset_results) if per_dataset_results else pd.DataFrame()

In [None]:
# Write the combined metric table to disk; the DataFrame index is not written.
results_csv_path = "../Data/genie_datasets/DBN_predictions/Results/full_model_results.csv"
df_all_results.to_csv(results_csv_path, index=False)