<a href="https://colab.research.google.com/github/tanderson11/covid_households/blob/main/SuperspreadingLab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Superspreading Laboratory

# Drive + Github configuration

* If you opened this notebook from Github, go to `File > Save a copy in Drive`.
* If running for the first time, set `first_time_setup = True`
* You'll have to authenticate with Google Drive the first time and again each time the runtime times out.
* To use tokens, fork the repository and go to `Github > User > Settings > Developer Settings > Personal access tokens`

In [None]:
# Flag controlling first-time setup: when True, the cell marked >>> SETUP <<< below
# clones the repository into your Drive.
# Set it back to False after running this set of cells once.
first_time_setup = False

In [None]:
# Mount Google Drive at /content/gdrive; the cells below work inside this mount.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
%mkdir /content/gdrive/My\ Drive/github
%cd /content/gdrive/My\ Drive/github

mkdir: cannot create directory ‘/content/gdrive/My Drive/github’: File exists
/content/gdrive/My Drive/github


In [None]:
# If you've forked the repository, point to your own username and repository name (if different)
# These two names are interpolated into the git URLs used in this notebook.
repo_owner="tanderson11"
repository="covid_households"

# >>> SETUP: <<<
# Anonymous clone over HTTPS; only runs when first_time_setup is True.
if first_time_setup:
    !git clone https://github.com/{repo_owner}/{repository}.git

# >>> TOKEN SETUP: <<<
# If you're using token authentication (recommended), add your token and uncomment the lines below instead
# NOTE(review): the echo line writes the token in plain text to git_token.py in the current
# directory -- make sure that file is never committed or shared.

#!echo git_token=\"your_token_here\" > git_token.py
#from git_token import git_token

#!git clone https://{git_token}@github.com/{repo_owner}/{repository}

In [None]:
%cd covid_households/
!ls -a

# >>> TOKEN SETUP: <<<
# this will put your token in the right folder; comment this line out after use to avoid an error message
#!mv ../git_token.py ./

from git_token import git_token

/content/gdrive/My Drive/github/covid_households
baseline_df.hdf
comparison_df.hdf
constants.py
df.hdf
experiments
forward_simulation.py
.git
.gitignore
git_token.py
inf_var-hsar-seed_one-0.0importation-04-29-17:46
.ipynb_checkpoints
Lab.ipynb
likelihood.py
ParameterInferenceLab.ipynb
population.py
__pycache__
README.md
settings.py
SuperspreadingLab.ipynb
sus_var-hsar-seed_one-0.0importation-04-29-16:17
sus_var-hsar-seed_one-0.0importation-04-29-20:02
torch_forward_simulation.py
traits.py
utilities.py
Vaccine.ipynb
vaccine.py


# Pulling upstream changes

Uncomment this block to pull upstream changes from github

In [None]:
# Show the working-tree state before pulling upstream changes.
!git status

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [None]:
# >>> PULLING CHANGES: <<<
#!git pull

# Initialization

In [9]:
# Initialization
import importlib
import vaccine
import population
import likelihood
import utilities
import pandas as pd
import numpy as np
import functools
import matplotlib.pyplot as plt
import traits
import seaborn as sns
import datetime 
import json

# Reload modules

In [None]:
# Re-import the local backend modules so that edits to their .py files take effect
# without restarting the kernel.
# NOTE(review): modules are reloaded in the order listed. If one of them binds names from
# another with `from x import y`, that dependency should be reloaded first -- confirm.
importlib.reload(vaccine)
importlib.reload(likelihood)
importlib.reload(population)
importlib.reload(utilities)
# Last expression in the cell, so the reloaded `traits` module is what gets displayed.
importlib.reload(traits)

<module 'traits' from '/content/gdrive/My Drive/github/covid_households/traits.py'>

# Experiments

## Parameter configuration

In [25]:
# Parameters of the baseline/'empirical' model to vary over
# key1 is swept in the outer loop of run_grid and key2 in the inner loop; both must be
# keys of axes_by_keys.
key1 = "inf_var"
key2 = "hsar"
keys = (key1, key2)

# Full sweep axes: the variances take 19 points from 0.0 to 1.8 (step 0.1),
# hsar takes 36 points from 0.15 to 0.50 (step 0.01).
sus_var_axis = np.linspace(0.0, 1.8, 19)
inf_var_axis = np.linspace(0.0, 1.8, 19)
hsar_axis = np.linspace(0.15, 0.50, 36)
axes_by_keys = {"sus_var": sus_var_axis, "inf_var":inf_var_axis, "hsar":hsar_axis}

# tiny axes to test functionality
# (two points per axis; pass tiny_axes_by_keys to run_grid in place of axes_by_keys)
tiny_sus_var_axis = np.linspace(0.7, 0.8, 2)
tiny_inf_var_axis = np.linspace(0.7, 0.8, 2)
tiny_hsar_axis = np.linspace(0.24, 0.25, 2)
tiny_axes_by_keys = {"sus_var": tiny_sus_var_axis, "inf_var":tiny_inf_var_axis, "hsar":tiny_hsar_axis}

# Default parameters for quantities that could vary
# run_grid uses these values for whichever parameter is not one of the two swept keys.
default_hsar = 0.3
default_sus = traits.GammaTrait("susceptibility", mean=1.0, variance=0.0)
default_inf = traits.GammaTrait("infectivity", mean=1.0, variance=0.0)

# Static baseline parameters
# The commented-out values are an alternative configuration with importation switched on.
#duration = 90
#cumulative_import_prob = 0.10
duration=0.
cumulative_import_prob=0.
# NOTE(review): both inputs are zero here -- confirm the helper handles duration == 0
# and returns a zero rate.
importation_rate = utilities.importation_rate_from_cumulative_prob(cumulative_import_prob, duration)

# NOTE(review): `seeding` is not referenced by any other cell visible in this notebook --
# confirm whether it is meant to be passed to population.Model.
seeding=utilities.seed_one_by_susceptibility

# Household composition of the baseline population: {household size: number of households}.
baseline_household_sizes = {6:1000}

# Not used to run any simulation, but read when naming and saving the experiment and
# dumped to json to preserve defaults (note: defaults, not baseline)
default_model = population.Model("baseline model", household_beta=utilities.household_beta_from_hsar(default_hsar), inf_dist=default_inf, sus_dist=default_sus, importation_rate=importation_rate, duration=duration)



## Functions for sweeping over a grid in parameter space

In [26]:
def make_baseline_df(sus_var, inf_var, hsar):
    """Simulate the baseline ('empirical') population for one point in parameter space.

    Parameters
    ----------
    sus_var : float
        Variance of the gamma-distributed susceptibility trait (mean fixed at 1.0).
    inf_var : float
        Variance of the gamma-distributed infectivity trait (mean fixed at 1.0).
    hsar : float
        Value converted to a household beta via `utilities.household_beta_from_hsar`.

    Returns
    -------
    The population's `df`, with the simulated outcomes stored in a new "infections" column.

    Notes
    -----
    Reads the notebook-level globals `importation_rate`, `duration` and
    `baseline_household_sizes`.
    """
    _sus_dist = traits.GammaTrait("susceptibility", mean=1.0, variance=sus_var)
    # Label the infectivity trait as "infectivity", matching `default_inf` in the
    # configuration cell; it was previously mislabelled "susceptibility" (copy-paste).
    _inf_dist = traits.GammaTrait("infectivity", mean=1.0, variance=inf_var)

    baseline_model = population.Model(
        "baseline model",
        household_beta=utilities.household_beta_from_hsar(hsar),
        inf_dist=_inf_dist,
        sus_dist=_sus_dist,
        importation_rate=importation_rate,
        duration=duration,
    )
    baseline_pop = population.Population(baseline_model, baseline_household_sizes)
    baseline_pop.df["infections"] = baseline_pop.simulate_population()
    return baseline_pop.df

def make_comparison_df(sus_var, inf_var, hsar, households_per_size=25000):
    """Simulate a large comparison population for one point in parameter space.

    The comparison population contains `households_per_size` households at every household
    size that appears in the baseline population.

    Parameters
    ----------
    sus_var : float
        Variance of the gamma-distributed susceptibility trait (mean fixed at 1.0).
    inf_var : float
        Variance of the gamma-distributed infectivity trait (mean fixed at 1.0).
    hsar : float
        Value converted to a household beta via `utilities.household_beta_from_hsar`.
    households_per_size : int, optional
        Number of households simulated at each size (default 25000, the original value).

    Returns
    -------
    The population's `df`, with the simulated outcomes stored in a new "infections" column.

    Notes
    -----
    Reads the notebook-level globals `importation_rate`, `duration` and
    `baseline_household_sizes`.
    """
    # Dict keys are already unique, so iterating the dict gives each baseline size once.
    household_sizes = {size: households_per_size for size in baseline_household_sizes}

    _sus_dist = traits.GammaTrait("susceptibility", mean=1.0, variance=sus_var)
    # Label the infectivity trait as "infectivity", matching `default_inf` in the
    # configuration cell; it was previously mislabelled "susceptibility" (copy-paste).
    _inf_dist = traits.GammaTrait("infectivity", mean=1.0, variance=inf_var)

    comparison_model = population.Model(
        "comparison model",
        inf_dist=_inf_dist,
        sus_dist=_sus_dist,
        importation_rate=importation_rate,
        duration=duration,
        household_beta=utilities.household_beta_from_hsar(hsar),
    )
    comparison_pop = population.Population(comparison_model, household_sizes)
    comparison_pop.df["infections"] = comparison_pop.simulate_population()
    return comparison_pop.df

def run_grid(key1, key2, axes_by_keys, make_df):
    """Run `make_df` at every point of a 2-D parameter grid and stack the results.

    Parameters
    ----------
    key1, key2 : str
        Names of the two parameters to sweep ("sus_var", "inf_var" or "hsar").
        `key1` is the outer loop, `key2` the inner loop.
    axes_by_keys : mapping
        Maps each parameter name to the iterable of values to sweep over.
    make_df : callable
        Called as `make_df(sus_var=..., inf_var=..., hsar=...)`; must return a DataFrame.

    Returns
    -------
    pandas.DataFrame
        Concatenation of every DataFrame returned by `make_df`, each tagged with
        "sus_var", "inf_var" and "hsar" columns holding the parameters used for it.

    Notes
    -----
    Parameters that are not swept take their value from the notebook-level defaults
    `default_sus`, `default_inf` and `default_hsar`.
    """
    arguments = {"sus_var": default_sus.variance, "inf_var": default_inf.variance, "hsar": default_hsar}

    dfs = []
    for y in axes_by_keys[key1]:
        for x in axes_by_keys[key2]:
            arguments[key1] = y
            arguments[key2] = x
            df = make_df(**arguments)

            # Tag every row with the parameters that produced it. Values are rounded to two
            # decimals so that floating-point noise from np.linspace does not create
            # spuriously distinct labels. The builtin float replaces the `np.float` alias,
            # which was removed from NumPy in 1.24.
            for k, v in arguments.items():
                df[k] = float("{0:.2f}".format(v))

            dfs.append(df)

    full_df = pd.concat(dfs)
    return full_df

## Running and writing

Outputs written to `experiments/` inside a directory with name based on the experiment conducted.

In [None]:
# Run the baseline simulation at every point of the (key1, key2) grid.
# For a quick functional test, pass tiny_axes_by_keys instead of axes_by_keys.
baseline_df = run_grid(key1, key2, axes_by_keys, make_baseline_df)
# Last expression in the cell, so the resulting DataFrame is displayed.
baseline_df

In [None]:
# All this code is accessory to making a folder and saving the results of the experiment

if default_model.importation_rate > 0:
    importation_str = "importation"
else:
    importation_str = "no_importation"

if "hsar" in [key1, key2]:
    hsar_str = ""
else:
    hsar_str = "-hsar{}-".format("{:.2f}".format(default_hsar).replace(".", ""))

name = "{}-{}{}-{}-{}".format(key1, key2, hsar_str, default_model.seeding.name, importation_str)
date_str = datetime.datetime.now().strftime("%m-%d-%H:%M")
directory_name = "{}-{}".format(name, date_str)

print(directory_name)

!mkdir ./experiments/{directory_name}
baseline_df.to_hdf('./experiments/{0}/baseline_df.hdf'.format(directory_name), key='baseline_df', mode='w')

with open('./experiments/{0}/default_model.json'.format(directory_name), 'w') as handle:
    handle.write(default_model.to_json())

with open('./experiments/{0}/keys.json'.format(directory_name), 'w') as handle:
    json.dump(keys, handle)

In [None]:
# Run the comparison simulation over the same grid and save it next to the baseline results.
# Relies on `directory_name` from the previous cell, so that cell must be run first.
full_comparison_df = run_grid(key1, key2, axes_by_keys, make_comparison_df)
full_comparison_df.to_hdf('./experiments/{0}/comparison_df.hdf'.format(directory_name), key='full_comparison_df', mode='w')

# Procedure for estimating likelihoods

## Basic multinomial likelihood

Suppose that we make an observation of infections in a population of $n$ households, all of size $s$, where $y_k$ gives the number of households that exhibited $k$ infections.

Suppose additionally, that $P$ is the set of true probabilities such that $p_{k}$ gives the probability that a household of size $s$ would yield $k$ infections at fixation. 

We want to know the (log) likelihood that we would observe each $y_k$ under the model $P$. In familiar notation, we want to determine $\mathcal{L} (Y | P) = \mathcal{L} (P | Y)$. Dropping the multinomial coefficient, which is a constant that does not depend on $P$, that likelihood is given as follows:

$$\mathcal{L} (Y | P) = \prod_k \left(p_{k} \right)^{y_{k}}$$

Taking the $\log$ of both sides:

\begin{align*}
    \log \left(\mathcal{L} (Y | P)\right) = \sum_k \left(y_k \log p_k\right)
\end{align*}

## In our context

It's our challenge to estimate the set of probabilities $P$. To do this, we choose some model of infection (see the README for a model description) and then simulate the outcomes across a large pool of identical households. 

From this bank of simulated data, we can identify the frequencies with which there are $k$ infections in a given household for all $k$. Provided that we simulate a large number of households, these frequencies give the probabilities $P$ -- conditioned on the infection model -- that a household of size $s$ experiences $k$ infections.

In this way, given two tables of data corresponding to an empirical observation and a large set of simulated households, where each row represents a household and contains the information about size and infections, we can assign a likelihood to the observation using the simulated households with the multinomial likelihood discussed above.

## More complex cases

What if not every household has the same size? What if individuals are not identical?

These are two natural extensions of the work above, and they are both nearly as easy thanks to the independence of households.

# Making a git commit from changes to python backend

To save this notebook use File > Save a copy in Github

In [None]:
# Stage only the edited backend modules.
!git add utilities.py
!git add population.py

In [None]:
# Stage every change in the working tree, then list what is staged.
# NOTE(review): as the recorded output shows, this also stages generated .hdf result files.
# Confirm that git_token.py and any outputs you do not want in the repository are covered
# by .gitignore before committing.
!git add -A
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git reset HEAD <file>..." to unstage)

	[32mnew file:   baseline_df.hdf[m
	[32mnew file:   comparison_df.hdf[m
	[32mnew file:   df.hdf[m
	[32mnew file:   experiments/sus_var-hsar-seed_one-0.0importation-04-29-20:25/baseline_df.hdf[m
	[32mnew file:   experiments/sus_var-hsar-seed_one-0.0importation-04-29-20:25/baseline_model.pickle[m
	[32mnew file:   experiments/sus_var-hsar-seed_one-0.0importation-04-29-20:25/comparison_df.hdf[m
	[32mnew file:   inf_var-hsar-seed_one-0.0importation-04-29-17:46/baseline_df.hdf[m
	[32mnew file:   inf_var-hsar-seed_one-0.0importation-04-29-17:46/baseline_model.pickle[m
	[32mnew file:   inf_var-hsar-seed_one-0.0importation-04-29-17:46/comparison_df.hdf[m
	[32mmodified:   population.py[m
	[32mnew file:   sus_var-hsar-seed_one-0.0importation-04-29-16:17/baseline_df.hdf[m
	[32mnew file:   sus_var-hsar-seed_one-0.0importation-04-29-16:17/compar

In [None]:
!git config --global user.email ""
!git config --global user.email "Thayer"

In [None]:
# Prompt for a commit message and commit whatever is currently staged.
message = input("Commit message? ")

# NOTE(review): the message is interpolated into a double-quoted shell argument, so
# characters the shell treats specially inside double quotes (", $, `) will be interpreted
# by the shell rather than kept literally. Keep messages plain.
!git commit -m "{message}"

 2 files changed, 16 insertions(+), 1 deletion(-)


# Pushing upstream

In [None]:
# Point `origin` at a URL that embeds the access token so pull/push can authenticate.
# NOTE(review): this stores the token in plain text in the repository's git configuration.
!git remote set-url origin https://{git_token}@github.com/{repo_owner}/{repository}

In [None]:
# Fetch and merge upstream commits before pushing.
!git pull

remote: Enumerating objects: 18, done.[K
remote: Counting objects:   5% (1/18)[Kremote: Counting objects:  11% (2/18)[Kremote: Counting objects:  16% (3/18)[Kremote: Counting objects:  22% (4/18)[Kremote: Counting objects:  27% (5/18)[Kremote: Counting objects:  33% (6/18)[Kremote: Counting objects:  38% (7/18)[Kremote: Counting objects:  44% (8/18)[Kremote: Counting objects:  50% (9/18)[Kremote: Counting objects:  55% (10/18)[Kremote: Counting objects:  61% (11/18)[Kremote: Counting objects:  66% (12/18)[Kremote: Counting objects:  72% (13/18)[Kremote: Counting objects:  77% (14/18)[Kremote: Counting objects:  83% (15/18)[Kremote: Counting objects:  88% (16/18)[Kremote: Counting objects:  94% (17/18)[Kremote: Counting objects: 100% (18/18)[Kremote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects:  11% (1/9)[Kremote: Compressing objects:  22% (2/9)[Kremote: Compressing objects:  33% (3/9)[Kremote: Compressing objects:  44%

In [None]:
# if there was a merge
#!git status
#!git commit -m "Merged"

On branch main
Your branch and 'origin/main' have diverged,
and have 1 and 6 different commits each, respectively.
  (use "git pull" to merge the remote branch into yours)

All conflicts fixed but you are still merging.
  (use "git commit" to conclude merge)

Changes to be committed:

	[32mmodified:   .gitignore[m
	[32mmodified:   Lab.ipynb[m
	[32mnew file:   SuperspreadingLab.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31minf_var-hsar-seed_one-0.0importation-04-29-17:46/[m

[main 5fabc36] Merged


In [None]:
# Check how far the local branch is ahead of origin before pushing.
!git status

On branch main
Your branch is ahead of 'origin/main' by 2 commits.
  (use "git push" to publish your local commits)

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31minf_var-hsar-seed_one-0.0importation-04-29-17:46/[m

nothing added to commit but untracked files present (use "git add" to track)


In [None]:
# Uncomment the two lines below if the remote URL has not been set with the token yet.
#from git_token import git_token
#!git remote set-url origin https://{git_token}@github.com/{repo_owner}/{repository}

# Publish the local commits to origin.
!git push

Counting objects: 6, done.
Delta compression using up to 2 threads.
Compressing objects:  16% (1/6)   Compressing objects:  33% (2/6)   Compressing objects:  50% (3/6)   Compressing objects:  66% (4/6)   Compressing objects:  83% (5/6)   Compressing objects: 100% (6/6)   Compressing objects: 100% (6/6), done.
Writing objects:  16% (1/6)   Writing objects:  33% (2/6)   Writing objects:  50% (3/6)   Writing objects:  66% (4/6)   Writing objects:  83% (5/6)   Writing objects: 100% (6/6)   Writing objects: 100% (6/6), 909 bytes | 454.00 KiB/s, done.
Total 6 (delta 4), reused 0 (delta 0)
remote: Resolving deltas: 100% (4/4), completed with 4 local objects.[K
To https://github.com/tanderson11/covid_households
   95e3a2c..5fabc36  main -> main
