In [None]:
import pandas as pd
from pyprojroot import here
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, auc, roc_curve

from nutils import bootstrap_auc

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Collect one row of evaluation metrics per prediction file.
#
# Paths are resolved against the project root returned by `here()`:
#   data/processed/matrices/point/<name>.csv   -> class predictions (first column is used)
#   data/processed/matrices/prob/<name>.csv    -> scores fed to the ROC / PR curves
#   data/processed/true_matrices/<target>.csv  -> ground-truth labels
# with <name> = '<target>-<model>-<origin>-<fs>-<hpo>' taken from the file stem.
results = []

for path in Path(here() / 'data/processed/matrices/point').glob('*.csv'):
    # Only the first five '-'-separated fields of the stem are meaningful.
    target, model, origin, fs, hpo = path.stem.split('-')[:5]

    # These targets / models are excluded from the comparison table.
    if target == 'cri' or model == 'guess':
        continue

    name = f'{target}-{model}-{origin}-{fs}-{hpo}'
    result = {'name': name}

    true_path = here() / f'data/processed/true_matrices/{target}.csv'
    prob_path = here() / f'data/processed/matrices/prob/{name}.csv'

    # ------------------------------------------------------------------
    # Threshold metrics, computed from the class (point) predictions
    # ------------------------------------------------------------------
    y_true = pd.read_csv(true_path, parse_dates=True, index_col='Datetime')
    y_pred = pd.read_csv(path, parse_dates=True, index_col='Datetime').iloc[:, 0]

    # Score only timestamps where both a label and a prediction are present.
    idx = y_pred.dropna().index.intersection(y_true.dropna().index)
    y_pred = y_pred.loc[idx]
    y_true = y_true.loc[idx]

    result['F1'] = f1_score(y_true, y_pred)

    # Pin the label order so the matrix is always 2x2. Without `labels`, a file
    # in which only one class occurs produces a 1x1 matrix and the cell lookups
    # fail with IndexError. Assumes 0/1 encoded labels, which the positive
    # fraction below (sum / count) already relies on.
    TN, FP, FN, TP = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Zero denominators yield nan/inf (numpy semantics); the resulting
    # RuntimeWarning is hidden by the notebook-wide warnings filter.
    result['TPR'] = TP / (TP + FN)   # sensitivity, hit rate, recall
    result['TNR'] = TN / (TN + FP)   # specificity
    result['PPV'] = TP / (TP + FP)   # precision
    result['NPV'] = TN / (TN + FN)   # negative predictive value
    result['FPR'] = FP / (FP + TN)   # fall-out
    result['FNR'] = FN / (TP + FN)   # miss rate
    result['FDR'] = FP / (TP + FP)   # false discovery rate (was computed but never stored)
    result['ACC'] = (TP + TN) / (TP + FP + FN + TN)

    # Fraction of positive labels among the evaluated timestamps.
    result['FrPos'] = (y_true.sum() / y_true.shape[0]).values[0]

    # ------------------------------------------------------------------
    # Ranking metrics, computed from the predicted scores
    # ------------------------------------------------------------------
    # Read and align once; the same aligned pair feeds both curves.
    y_true = pd.read_csv(true_path, index_col='Datetime')
    y_pred = pd.read_csv(prob_path, index_col='Datetime')

    df = pd.concat([y_true, y_pred], axis=1).dropna()
    y_true = df.iloc[:, 0]
    y_pred = df.iloc[:, 1]

    # AUROC with bootstrap confidence bounds.
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    _auc = auc(fpr, tpr)
    lb, ub = bootstrap_auc(y_true, y_pred)
    result['AUROC'] = f'{_auc:.2f} ({lb:.2f}-{ub:.2f})'

    # Area under the precision-recall curve.
    precision, recall, _ = precision_recall_curve(y_true, y_pred, pos_label=1)
    _auc = auc(recall, precision)
    # NOTE(review): `bootstrap_auc` (from nutils, not visible here) is called
    # with exactly the same arguments as for AUROC above. Unless the helper
    # resamples the PR curve, this interval describes ROC AUC rather than
    # PR AUC -- confirm before reporting it next to the PRAUC point estimate.
    lb, ub = bootstrap_auc(y_true, y_pred)
    result['PRAUC'] = f'{_auc:.2f} ({lb:.2f}-{ub:.2f})'

    results.append(result)

In [None]:
# One row per evaluated prediction file: numeric metrics rounded to two
# decimals (string columns are left as they are), ordered by experiment name.
table = (
    pd.DataFrame(results)
    .round(2)
    .sort_values(by='name')
)

In [None]:
# Break the '-'-joined experiment name back into its five components,
# one new column per component.
table[['Target', 'Model', 'Origin', 'fs', 'hpo']] = (
    table['name'].str.split('-', expand=True)
)

In [None]:
# Keep only the columns that go into the exported table, in presentation order.
# `.copy()` makes the selection an independent frame: the following cells assign
# into its columns, which on pandas versions without copy-on-write raises
# SettingWithCopyWarning on a bare selection (hidden here by the notebook-wide
# warnings filter).
table = table[
    ['Target', 'Model', 'Origin', 'F1', 'TPR', 'TNR', 'PPV', 'NPV', 'FPR', 'FNR', 'ACC', 'AUROC', 'PRAUC']
].copy()

In [None]:
# `Origin` comes out of the name split as text; cast it to int so rows sort
# numerically within each target rather than lexicographically.
table = (
    table
    .astype({'Origin': int})
    .sort_values(by=['Target', 'Origin'])
)

In [None]:
# Upper-case the first letter of each target name (the rest is lower-cased).
table['Target'] = table['Target'].str.capitalize()

In [None]:
# Expand the abbreviated target names to full labels; values that do not
# exactly match a key are left unchanged.
table['Target'] = table['Target'].replace(
    {'Bed':'Bedoccupying', 'Med':'Medical', 'Sur':'Surgical'}
)

In [None]:
# Write the metrics table to output/tables/metrics_all.tex under the project root.
# Because `buf` is supplied, `to_latex` writes the file and returns None,
# so `latex` holds None after this cell runs.
latex = table.to_latex(
    index=False,
    float_format='%.2f',
    caption='''
    Performance of XGBoost, CatBoost and LightGBM in relation to one another.
    ''',
    label='tab:metrics_all',
    position='H',
    buf=here() / 'output/tables/metrics_all.tex',
)