## How well do volunteers do against the aggregate model?

Note: this requires the `volunteer_skill_exploration.ipynb` notebook to have been run on each `subject_id`, with the output notebooks saved to the `volunteer_losses/` directory.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import re
import json
import numpy as np
import scipy.stats as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from gzbuilder_analysis.parsing import unmake_json
from gzbuilder_analysis.fitting import Model, loss
import lib.galaxy_utilities as gu
import scrapbook as sb

In [None]:
# Load the fitted-model table from a local pickle; its `agg_loss` entry is
# read in the next cell.
# NOTE(review): unpickling executes arbitrary code -- only load a file you
# produced yourself.
fitted_models = pd.read_pickle('lib/fitted_models.pickle')

In [None]:
# Loss values stored under `agg_loss`; later cells look these up by the
# index labels of each volunteer's losses (via `reindex`).
agg_losses = fitted_models['agg_loss']

In [None]:
# Read every notebook in `volunteer_losses` and pull out its recorded scraps.
# Each notebook contributes one Series built from its `losses` scrap and named
# by its `subject_id` scrap (NaN when a scrap is missing); after the transpose
# each notebook is a row of `loss_df`.
nbs = sb.read_notebooks('volunteer_losses')
loss_df = pd.concat(
  [
    pd.Series(scraps.get('losses', np.nan)).rename(scraps.get('subject_id', np.nan))
    for scraps in (notebook.scraps.data_dict for notebook in nbs.values())
  ],
  axis=1,
).T

In [None]:
# For every column of `loss_df`, compute the fraction of its non-missing
# losses that are strictly lower than the aggregate loss with the same index
# label.
# An explicit float dtype keeps the accumulator numeric on every pandas
# version (a dtype-less empty Series warns or becomes object dtype on newer
# releases).
p_better_than_agg = pd.Series([], dtype=float)
for name in loss_df.columns:
  res = loss_df[name].dropna()
  if len(res) > 0:
    # Boolean mask: True where this loss beats the aggregate loss.
    better = res - agg_losses.reindex(res.index) < 0
    # The mean of a boolean mask is the fraction of True entries. (The mask
    # was previously compared with `< 0` a second time, which is always
    # False and made every fraction 0.)
    p_better_than_agg.loc[name] = better.mean()

$$\mathrm{Ability} \sim \mathrm{Beta}(1 + N_\mathrm{better},\ 1 + N_\mathrm{classifications} - N_\mathrm{better})$$

In [None]:
# Model each volunteer's "ability" as Beta(1 + n_better, 1 + n - n_better),
# where n is their number of non-missing losses and n_better counts those
# strictly below the aggregate loss. Only volunteers with at least 10 losses
# are included. Summary statistics are stored per volunteer and every density
# is drawn, semi-transparent, on one figure.
x = np.linspace(0, 1, 500)
# Explicit float dtype keeps these accumulators numeric on every pandas version.
five_percentile = pd.Series([], dtype=float)
ninety_five_percentile = pd.Series([], dtype=float)
means = pd.Series([], dtype=float)
medians = pd.Series([], dtype=float)
plt.figure(figsize=(16, 4), dpi=100)
for name in loss_df.columns:
  res = loss_df[name].dropna()
  if len(res) >= 10:
    better = res - agg_losses.reindex(res.index) < 0
    n_better = better.sum()
    # Freeze the distribution once instead of re-deriving its parameters
    # for every statistic.
    ability = st.beta(1 + n_better, 1 + len(res) - n_better)
    dist = ability.pdf(x)
    medians.loc[name] = ability.median()
    means.loc[name] = ability.mean()
    # Exact quantiles from the inverse CDF: deterministic, unlike the
    # percentiles of a random sample, so the results are reproducible.
    five_percentile.loc[name] = ability.ppf(0.05)
    ninety_five_percentile.loc[name] = ability.ppf(0.95)
    plt.fill_between(x, 0, dist, alpha=0.05, color='k')
plt.xlim(0, 1);

In [None]:
# Overlay the Beta ability densities of three named volunteers, one matplotlib
# cycle colour each. Reuses the `x` grid defined in an earlier cell.
plt.figure(figsize=(16, 4), dpi=100)
highlighted = ('klmasters', 'tingard', 'ElisabethB')
for idx, volunteer in enumerate(highlighted):
  colour = 'C{}'.format(idx)
  losses = loss_df[volunteer].dropna()
  n_total = len(losses)
  # Count the losses that are strictly below the aggregate loss.
  n_wins = (losses - agg_losses.reindex(losses.index)).lt(0).sum()
  density = st.beta(1 + n_wins, 1 + n_total - n_wins).pdf(x)
  plt.fill_between(x, 0, density, alpha=0.2, color=colour)
  plt.plot(
    x, density, colour,
    linewidth=0.5, alpha=1,
    label='{} ({} classifications)'.format(volunteer, n_total),
  )
plt.legend()

In [None]:
# Kernel-density plots of the per-volunteer summary statistics, drawn in the
# same order as before, followed by descriptive statistics of the 95% bound
# as the cell's displayed value.
for series, label in (
  (ninety_five_percentile, '95% upper bound'),
  (medians, 'medians'),
  (means, 'means'),
):
  sns.kdeplot(series, shade=True, label=label)
ninety_five_percentile.describe()

So what does this all mean? It means that the best individual model consistently outperforms the aggregate (around 70% of the time). However, for any individual volunteer, we can say with greater than 95% confidence that their model will be worse than the tuned aggregate model more than half the time. 