# Evaluate model significance
We test whether models perform significantly differently from one another using a paired randomization (permutation) test, following [this reference](https://engineering.purdue.edu/kak/SignificanceTesting.pdf).

The final cells compute all pairwise comparisons between models; the notebook could later be improved to also compare every model against random. In general, it seems that all are better than random. This notebook uses delta mAP (the difference in mean average precision between two models) as the test statistic, though in practice, we also considered AP@1-5.

In [1]:
import numpy as np
import pandas as pd
import pickle

def read_pickle(handle):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    handle : str or path-like
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever object was serialized into the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files from a
    # trusted source.
    # The context manager closes the file even if unpickling raises.
    with open(handle, "rb") as pickle_file:
        return pickle.load(pickle_file)

# Load the evaluated models and the evaluated random baselines from their pickle files.
models = read_pickle('tables/models.evaluated.pkl')
models_random = read_pickle('tables/models.random.evaluated.pkl')

In [2]:
# Show the identifiers of the evaluated models.
models.keys()

dict_keys(['jaccard-almanac-genes', 'jaccard-almanac-feature-types', 'jaccard-almanac-features', 'jaccard-cgc-genes', 'jaccard-cgc-feature-types', 'compatibility', 'nonsynonymous-variant-count', 'pca-almanac-genes', 'pca-cgc-genes', 'multi-pass-sort_fda-cgc', 'snf_fda-cgc-genes', 'snf_cgc', 'snf_fda-cgc', 'snf_almanac', 'somatic-tree'])

In [3]:
# Show the identifiers of the random baselines.
models_random.keys()

dict_keys(['random_mean', 'random_plus_one_std', 'random_minus_one_std'])

In [4]:
def compare_two_models(case_dict, comparison_dict, case_label, comparison_label, N):
    """Paired randomization test on the difference in mean average precision.

    For each of ``N`` rounds, every row's pair of average-precision values is
    either kept or swapped between the two models with probability 0.5, and
    the difference of the resulting column means is recorded. Round ``i`` is
    seeded with ``i``, so the result is reproducible.

    Parameters
    ----------
    case_dict, comparison_dict : dict
        Each must provide ``'mean_average_precision'`` (a number) and
        ``'average_precision'`` (a ``pandas.Series``). The two series are
        aligned on their index before shuffling.
    case_label, comparison_label : str
        Kept for backward compatibility. They are no longer used as column
        names, so they do not have to differ (comparing a model with itself
        under one label is fine).
    N : int
        Number of randomization rounds.

    Returns
    -------
    delta_mAP : float
        Observed ``case mAP - comparison mAP``.
    delta_mAPs : pandas.Series
        Float series indexed ``0..N-1`` holding the shuffled delta of each
        round.
    """
    case_mAP = case_dict['mean_average_precision']
    comparison_mAP = comparison_dict['mean_average_precision']
    delta_mAP = case_mAP - comparison_mAP

    # Fixed internal column names: caller-supplied labels may be identical,
    # which would create duplicate columns and break column selection.
    aps = pd.concat([
        case_dict['average_precision'].rename('case'),
        comparison_dict['average_precision'].rename('comparison')
    ], axis=1)
    case_ap = aps['case'].to_numpy(dtype=float)
    comparison_ap = aps['comparison'].to_numpy(dtype=float)
    n_rows = aps.shape[0]

    shuffled_deltas = []
    for seed in range(N):
        rng = np.random.default_rng(seed=seed)
        # A draw of 1 keeps the row's assignment, a draw of 0 swaps it.
        keep = rng.binomial(1, 0.5, n_rows) == 1

        # Build float columns directly instead of overwriting an int column,
        # so no implicit dtype upcasting is needed.
        shuffled_case = pd.Series(np.where(keep, case_ap, comparison_ap))
        shuffled_comparison = pd.Series(np.where(keep, comparison_ap, case_ap))

        # Series.mean skips NaN, which appear when the two indexes differ.
        shuffled_deltas.append(shuffled_case.mean() - shuffled_comparison.mean())

    delta_mAPs = pd.Series(shuffled_deltas, index=range(0, N), dtype=float)
    return delta_mAP, delta_mAPs

In [5]:
# Pair under test: one evaluated model against the random baseline.
case = 'snf_fda-cgc'
comparison = 'random_mean'

case_model = models[case]
# The baseline lives in `models_random`; to compare against another evaluated
# model instead, look `comparison` up in `models`.
comparison_model = models_random[comparison]

delta_mAP, series = compare_two_models(case_model, comparison_model, case, comparison, 10000)

# Two-sided p-value: share of shuffled deltas whose magnitude is at least the
# observed one (0 when none reach it).
series_value_counts = series.abs().ge(abs(delta_mAP)).value_counts()
pvalue = (
    series_value_counts[True] / series.shape[0]
    if True in series_value_counts.index
    else 0
)

report_lines = [
    f"{case} AP@1: {case_model['ap@k'][1]}",
    f"{comparison} AP@1: {comparison_model['ap@k'][1]}",
    f"{case} mAP: {case_model['mean_average_precision']}",
    f"{comparison} mAP: {comparison_model['mean_average_precision']}",
    f"delta mAP: {delta_mAP}",
    f"pvalue: {pvalue}",
    '',
]
print("\n".join(report_lines))

# Displayed as the cell output: counts of rounds at/above vs. below the observed delta.
series_value_counts

snf_fda-cgc AP@1: 0.1909814323607427
random_mean AP@1: 0.09549071618037135
snf_fda-cgc mAP: 0.12725321400504938
random_mean mAP: 0.1104324533000405
delta mAP: 0.016820760705008875
pvalue: 0.0048



False    9952
True       48
dtype: int64

In [6]:
# Display the model identifiers as a list.
list(models.keys())

['jaccard-almanac-genes',
 'jaccard-almanac-feature-types',
 'jaccard-almanac-features',
 'jaccard-cgc-genes',
 'jaccard-cgc-feature-types',
 'compatibility',
 'nonsynonymous-variant-count',
 'pca-almanac-genes',
 'pca-cgc-genes',
 'multi-pass-sort_fda-cgc',
 'snf_fda-cgc-genes',
 'snf_cgc',
 'snf_fda-cgc',
 'snf_almanac',
 'somatic-tree']

In [7]:
# Build the matrix of pairwise p-values: rows are the "case" model, columns
# the "comparison" model.
all_models = list(models.keys())
# Start from a float matrix (NaN = not yet computed) so that storing float
# p-values never requires changing an integer column's dtype.
df = pd.DataFrame(np.nan, index=all_models, columns=all_models, dtype=float)
for model_row in all_models:
    for model_col in all_models:
        case_model = models[model_row]
        comparison_model = models[model_col]

        # Label each side with the models actually being compared rather than
        # with variables left over from an earlier cell. The suffixes keep the
        # two labels distinct on the diagonal (model compared with itself).
        delta_mAP, series = compare_two_models(
            case_model,
            comparison_model,
            f"{model_row} (case)",
            f"{model_col} (comparison)",
            10000,
        )

        # Two-sided p-value: share of shuffled deltas at least as extreme as
        # the observed delta (0 when none are).
        series_value_counts = series.abs().ge(abs(delta_mAP)).value_counts()
        if True in series_value_counts.index:
            pvalue = series_value_counts[True] / series.shape[0]
        else:
            pvalue = 0

        df.loc[model_row, model_col] = pvalue

In [8]:
# Save the pairwise p-value matrix as a tab-separated file; the index column is labelled 'model-id'.
df.to_csv('tables/pairwise-model-comparison.txt', sep='\t', index_label='model-id')