In [None]:
import yaml
import pandas as pd
import numpy as np
from IPython.core.display import display, Markdown

In [None]:
# Relative path of the YAML file holding the run's metadata and stats.
nm_info_path = "output/details/nm_info.yaml"

In [None]:
# Parse the run summary with the safe loader. The context manager closes the
# file handle as soon as parsing finishes instead of leaving it open for the
# lifetime of the kernel.
with open(nm_info_path) as info_file:
    info = yaml.safe_load(info_file)

# Top-level sections read by the rest of the notebook.
metadata = info['metadata']
stats = info['stats']

In [None]:
# Headline blocking and prediction metrics. These are only assembled for
# non-incremental runs; in incremental mode `key_stats` is left undefined.
if not metadata['params']['incremental']:
    prediction_stats = stats['FitModel']['model_stats__match']['basic']['all pairs']
    key_stats = {
        'Blocking: pair completeness': stats['Block']['pc_eq_cosed'],
        'Blocking: pair completeness (excluding exact matches)': stats['Block']['pc_neq_cosed'],
        # Remaining rows: (display label, field of prediction_stats), in table order.
        **{
            label: prediction_stats[field]
            for label, field in (
                ('Prediction: baserate, or P(match) in ground truth data', 'baserate'),
                ('Prediction: precision', 'precision'),
                ('Prediction: recall', 'recall'),
                ('Prediction: AUC', 'auc'),
                ('Prediction: false positive rate', 'fp_rate'),
                ('Prediction: false negative rate', 'fn_rate'),
            )
        },
    }

# Counts describing how input records turn into final clusters, all read from
# the `stats` mapping loaded above.
n_valid = stats['ProcessInputData']['n_valid_an']
key_ns = {
    'Number of records': stats['ProcessInputData']['n_an'],
    'Number of records with required values': n_valid,
    # Unordered pairs among the valid records: n choose 2.
    'Number of possible record pairs': (n_valid * (n_valid-1)) / 2,
    # `np.nan` (lowercase) is the supported spelling; the `np.NaN` alias was
    # removed in NumPy 2.0 and raises AttributeError there.
    'Number of candidate record pairs (record pairs considered post blocking)': stats['Block'].get('n_cand_pairs', np.nan),
    'Number of potential links (candidate record pairs predicted to match)': stats['Cluster']['n_potential_edges'],
    'Number of potential links deemed invalid by pair-level constraints': stats['Cluster']['n_invalid_links'],
    'Number of potential links deemed invalid by cluster-level constraints': stats['Cluster']['n_invalid_clusters'],
    'Number of final clusters': stats['Cluster']['n_clusters'],
    'Number of final clusters with no links (i.e. one-record clusters)': stats['Cluster']['n_singleton_clusters']
}
if metadata['params']['incremental']:
    # The candidate-pair row is left out of the table for incremental runs.
    # `del` avoids rebinding `_`, which IPython uses for the last cell output.
    del key_ns['Number of candidate record pairs (record pairs considered post blocking)']

# Key performance metrics

Note: Prediction performance metrics (precision, recall, AUC, false positive rate, and false negative rate) are reported out of sample. That is, they are computed on *held-out* ground truth data not used in model training.

In [None]:
# Incremental runs get a notice in place of the metrics table; otherwise the
# key stats are rendered as a one-column table of percentages.
if metadata['params']['incremental']:
    display(Markdown("<font color='red'>Key performance metrics are not yet supported for incremental Name Match runs.</font>"))
else:
    display(pd.Series(key_stats, name="").to_frame().style.format('{:.1%}'))

# Important counts: understanding how final links are made

In [None]:
# One-column table of the counts, with thousands separators and no decimals.
pd.Series(key_ns, name="").to_frame().style.format('{:,.0f}')