In [1]:
# Initialization
%cd ../covid_households/
import recipes
import utilities
import traits

import tqdm
from multiprocessing import Pool

/Users/thayer/covid_households/covid_households


# What is this notebook for?

We conduct our simulations over a range of values for each of our three parameters, $s_{80}$, $p_{80}$, and $\text{SAR}$. But these parameters are not direct inputs into our model; they are complex expressions of properties of the distributions of relative susceptibility and infectivity in the population ($s_{80}$ and $p_{80}$) or of the average risk of infection from a household contact ($\text{SAR}$). For full information about these parameters, see the Methods and Supplemental Methods sections.

To convert these parameters to actual model parameters (the mean & variance of distributions; or $\beta$, the probability/time of infection) we use numerical methods. For the overwhelming majority of parameter combinations, this works well. But when $p_{80}$ or $s_{80}$ is small and $\text{SAR}$ is high, we cannot solve for a $\beta$ that actually produces the desired $\text{SAR}$. There is so much heterogeneity (and thus so many people that are negligibly infectious or susceptible) that we can't solve for an appropriately high $\beta$ given that $\beta < 1$.

We want to drop these points from our 3D grid in parameter space so that the likelihood surface does not include points with an unrealistic $\beta$. To do that, we first have to find every point where the residual from the numerical fit is higher than our tolerance ($10^{-5}$).

We define the region over which we simulate by enumerating each of its axes. In order to compute in parallel, we also make a `coordinate_stream` generator that yields coordinate triples $(s_{80}, p_{80}, \text{SAR})$ for the entire region in sequence.

In [3]:
import numpy as np
# Evenly spaced, endpoint-inclusive grids for the three simulation parameters.
# s80 and p80 share one range: 36 points from 0.10 to 0.80 (a step of 0.02).
s80_axis = np.linspace(start=0.10, stop=0.80, num=36)
p80_axis = np.linspace(start=0.10, stop=0.80, num=36)
# SAR: 56 points from 0.05 to 0.60 (a step of 0.01).
sar_axis = np.linspace(start=0.05, stop=0.60, num=56)

def coordinate_stream(axis1, axis2, axis3):
    """Lazily yield every ``(v1, v2, v3)`` combination of the three axes.

    Tuples come out in nested-loop order: ``axis1`` varies slowest and
    ``axis3`` varies fastest. ``axis2`` is iterated once per value of
    ``axis1`` and ``axis3`` once per ``(axis1, axis2)`` pair, so both
    should be re-iterable containers (arrays, lists) rather than
    one-shot iterators.
    """
    yield from (
        (outer, middle, inner)
        for outer in axis1
        for middle in axis2
        for inner in axis3
    )

Using Python's multiprocessing functionality, we iterate over each point in the region and apply the `calculate_residual` function from `utilities` in order to find the difference between the expected $\text{SAR}$ and the $\text{SAR}$ that is actually implied by $\beta$ and the traits.

In [5]:
# Number of grid points; only needed so tqdm can draw a complete progress bar
# (the coordinate generator itself has no length).
total = len(s80_axis) * len(p80_axis) * len(sar_axis)
grid_points = coordinate_stream(s80_axis, p80_axis, sar_axis)

# Fan the residual calculation out over 4 worker processes. `imap` returns
# results in submission order, so `residuals` lines up with the order in
# which `coordinate_stream` walks the grid.
with Pool(4) as pool:
    residual_iter = pool.imap(utilities.calculate_residual, grid_points)
    residuals = list(tqdm.tqdm(residual_iter, total=total))

100%|█████████████████████████████████████| 72576/72576 [23:49<00:00, 50.77it/s]


In [7]:
# Build a private copy of the beta lookup table with the freshly computed
# residuals attached as a new column; `assign` returns a new frame, so the
# table held by `utilities` is not modified.
beta_crib_copy = utilities.S80_P80_SAR_Inputs.beta_crib.assign(residuals=residuals)

In [9]:
# Tolerance on the numerical-fit residual, as stated in the notebook text: 10^-5.
# Written as 1e-5 on purpose: the float literal `10e-5` means 10 * 10^-5 = 1e-4,
# which is ten times looser than the documented tolerance.
RESIDUAL_TOLERANCE = 1e-5

# Flag every grid point whose residual exceeds the tolerance, i.e. where no
# beta could be found that reproduces the requested SAR.
beta_crib_copy['bad beta'] = beta_crib_copy['residuals'] > RESIDUAL_TOLERANCE

In [17]:
# Display only the grid points that were flagged as having a bad beta.
is_bad = beta_crib_copy['bad beta']
beta_crib_copy.loc[is_bad]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,beta,residuals,bad beta
s80,p80,SAR,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.1,0.10,0.25,0.999993,0.002830,True
0.1,0.10,0.26,0.999999,0.012829,True
0.1,0.10,0.27,0.999999,0.022829,True
0.1,0.10,0.28,0.999999,0.032829,True
0.1,0.10,0.29,0.999998,0.042829,True
...,...,...,...,...,...
0.8,0.14,0.57,0.999999,0.004348,True
0.8,0.14,0.58,0.999997,0.014349,True
0.8,0.14,0.59,0.999999,0.024348,True
0.8,0.14,0.60,0.999998,0.034349,True


In [43]:
# Write the whole table (every grid point, including the 'bad beta' flag
# column) to a CSV in the current working directory.
output_csv = './problematic_parameter_combinations.csv'
beta_crib_copy.to_csv(output_csv)

In [23]:
# Load a previously saved set of simulation results from disk.
# NOTE(review): absolute, machine-specific path; this cell will not run on another machine as written.
results = recipes.Results.load('/Users/thayer/covid_households/new_parameters/gillespie-s80-p80-SAR')
# Called with inplace=True and the return value is discarded. A later cell reads
# results.df['frequency'], so this presumably populates that column (TODO confirm in recipes).
results.find_frequencies(inplace=True)
# Parameter names recorded in the results metadata; passed as `keys` to the likelihood helpers below.
keys = results.metadata.parameters

In [27]:
import os
import pyarrow.parquet as pq
# `likelihood` is called at the bottom of this cell, so it must be imported
# here; relying on an import in a later cell fails with NameError on a fresh
# kernel (Restart & Run All).
import likelihood

# NOTE(review): absolute, machine-specific root; adjust when running elsewhere.
root = "/Users/thayer/covid_households/"
# Keep the file path and the loaded table under separate names so that
# re-running the cell never passes a DataFrame to os.path.join / read_table.
empirical_path = os.path.join(root, "empirical/Ontario/empirical_df.parquet")
empirical_df = pq.read_table(empirical_path).to_pandas()

# Add one constant 0.0 column per parameter key. Presumably these are
# placeholder parameter columns that counts_from_empirical requires (TODO
# confirm against the likelihood module).
for key in keys:
    empirical_df[key] = 0.
empirical_counts = likelihood.counts_from_empirical(empirical_df, keys, sample_only_keys=[])

In [32]:
import likelihood
# Work on a copy of the 'frequency' column so results.df itself is not touched by later steps.
frequencies = results.df['frequency'].copy()
# Score the empirical counts against the simulated frequencies over the full, unfiltered grid.
# NOTE(review): judging by the helper's name this returns log-likelihoods; confirm in the likelihood module.
logl = likelihood.logl_from_frequencies_and_counts(frequencies, empirical_counts, keys)

In [38]:
# Index labels of every grid point flagged as having a bad beta.
bad_beta_index = beta_crib_copy[beta_crib_copy['bad beta']].index
# Remove those grid points from the simulated frequencies; `drop` returns a
# new Series, so `frequencies` is left unchanged.
try_drop = frequencies.drop(bad_beta_index)

In [40]:
# Spot check: what remains after the drop at the first two index levels = (0.1, 0.1).
try_drop.loc[(0.1, 0.1)]

SAR   size  infections
0.05  2     1             0.979075
            2             0.020925
      3     1             0.953415
            2             0.038095
            3             0.008490
                            ...   
0.24  8     4             0.044740
            5             0.055115
            6             0.079085
            7             0.118030
            8             0.156590
Name: frequency, Length: 700, dtype: float64

In [42]:
# Re-score the empirical counts using only the grid points that survived the drop.
logl_dropped = likelihood.logl_from_frequencies_and_counts(try_drop, empirical_counts, keys)
# Inspect the slice at leading index labels (0, 0.1, 0.1).
# NOTE(review): the meaning of the leading 0 level is not visible in this notebook; confirm in the likelihood module.
logl_dropped.loc[(0, 0.1, 0.1)]

SAR
0.05   -35625.878801
0.06   -33995.335593
0.07   -32637.403433
0.08   -31483.873504
0.09   -30596.335508
0.10   -29869.141507
0.11   -29225.854729
0.12   -28693.304793
0.13   -28250.902012
0.14   -27889.742978
0.15   -27586.048467
0.16   -27373.443593
0.17   -27194.806803
0.18   -27062.730097
0.19   -26971.971471
0.20   -26925.170734
0.21   -26915.597274
0.22   -26942.499147
0.23   -26996.498818
0.24   -27079.943205
Name: logl, dtype: float64