In [2]:
import pandas as pd
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
import war

%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Analysis

Microbiome composition can be shifted by diet. The experiments that have backed this conclusion involve randomly assigning mice into control and case groups, feeding them different diets, and then quantifying the composition of the microbiota in the gut. I want to know whether we are able to use a Bayesian Dirichlet-Multinomial model to quantify the uncertainty surrounding the measured proportions. 

In [5]:
microbiome = pd.read_csv("../datasets/MicrobiomeWithMetadata.csv")
microbiome.head()

Unnamed: 0,Diet,Source,Donor,CollectionMet,Sex,OTU0,OTU1,OTU2,OTU3,OTU4,...,OTU6686,OTU6687,OTU6688,OTU6689,OTU6690,OTU6691,OTU6692,OTU6693,OTU6694,OTU6695
0,0,0,0,0,0,1.56e-11,4.72e-11,1.23e-11,4.52e-11,2.72e-11,...,6.66e-11,3.02e-11,4.42e-11,7.31e-11,7.77e-11,4.33e-11,5.44e-11,8.72e-11,2.71e-11,4.97e-11
1,0,1,0,0,0,2.36e-11,9.53e-11,3.33e-11,2.67e-11,2.02e-11,...,3.26e-12,5.39e-11,4.73e-11,2.6e-11,4.24e-11,6.55e-11,4.85e-11,8.38e-11,3.5e-11,7.62e-11
2,0,2,0,1,0,6.77e-11,3.68e-11,8.02e-11,5.49e-11,1.34e-11,...,7.23e-11,6.3e-12,7.06e-11,8.31e-11,3.31e-11,7.38e-11,4.45e-11,9.65e-12,7.88e-11,3.99e-11
3,0,2,0,0,0,5.52e-11,9.89e-11,4.58e-11,3.54e-11,2.09e-11,...,9.1e-11,2.94e-11,1e-10,9.82e-11,8.54e-11,9.73e-11,2.96e-11,8.58e-13,5.88e-11,5.51e-11
4,0,3,0,0,0,5.24e-11,6.34e-11,2.35e-11,7.47e-11,2.49e-11,...,1.5e-11,4.9e-11,8.11e-12,4.67e-11,8.27e-11,4.63e-11,1.6e-11,5.55e-11,7.84e-11,8.56e-11


The metadata file that's associated with this CSV file has to be re-coded from a CSV file to a YAML file.

In [None]:
from collections import defaultdict

md = defaultdict(dict)  # "md" stands for "metadata dictionary"

sex = ["Male", "Female"]
donor = ["HMouseLFPP", "CONVR", "Human", "Fresh", "Frozen", "HMouseWestern", "CONVD"]
diet = ["LFPP", "Western", "CARBR", "FATR", "Suckling", "Human"]
source = [
    "Cecum1",
    "Cecum2",
    "Colon1",
    "Colon2",
    "Feces",
    "SI1",
    "SI13",
    "SI15",
    "SI2",
    "SI5",
    "SI9",
    "Stomach",
    "Cecum",
]
collection_met = ["Contents", "Scraping"]

for i, s in enumerate(sex):
    md["sex"][i] = s

for i, d in enumerate(donor):
    md["donor"][i] = d

for i, d in enumerate(diet):
    md["diet"][i] = d

for i, s in enumerate(source):
    md["source"][i] = s

for i, c in enumerate(collection_met):
    md["collection_met"][i] = c

In [20]:
import yaml

print(yaml.dump(md))

!!python/object/apply:collections.defaultdict
args: [!!python/name:builtins.dict '']
dictitems:
  collection_met: {0: Contents, 1: Scraping}
  diet: {0: LFPP, 1: Western, 2: CARBR, 3: FATR, 4: Suckling, 5: Human}
  donor: {0: HMouseLFPP, 1: CONVR, 2: Human, 3: Fresh, 4: Frozen, 5: HMouseWestern,
    6: CONVD}
  sex: {0: Male, 1: Female}
  source: {0: Cecum1, 1: Cecum2, 2: Colon1, 3: Colon2, 4: Feces, 5: SI1, 6: SI13,
    7: SI15, 8: SI2, 9: SI5, 10: SI9, 11: Stomach, 12: Cecum}



In [21]:
with open("datasets/MicrobiomeMetadataDictionary.yml", "w+") as f:
    f.write(yaml.dump(md))

In [23]:
with open("datasets/MicrobiomeMetadataDictionary.yml", "r+") as f:
    metadata = yaml.load(f)

metadata

defaultdict(dict,
            {'collection_met': {0: 'Contents', 1: 'Scraping'},
             'diet': {0: 'LFPP',
              1: 'Western',
              2: 'CARBR',
              3: 'FATR',
              4: 'Suckling',
              5: 'Human'},
             'donor': {0: 'HMouseLFPP',
              1: 'CONVR',
              2: 'Human',
              3: 'Fresh',
              4: 'Frozen',
              5: 'HMouseWestern',
              6: 'CONVD'},
             'sex': {0: 'Male', 1: 'Female'},
             'source': {0: 'Cecum1',
              1: 'Cecum2',
              2: 'Colon1',
              3: 'Colon2',
              4: 'Feces',
              5: 'SI1',
              6: 'SI13',
              7: 'SI15',
              8: 'SI2',
              9: 'SI5',
              10: 'SI9',
              11: 'Stomach',
              12: 'Cecum'}})

In [24]:
set(microbiome["Diet"].values)

{0, 1, 2, 3, 4, 5}

In [25]:
set(microbiome["Source"].values)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}

In [26]:
otu_cols = [c for c in microbiome.columns if "OTU" in c]

In [None]:
with pm.Model() as dirichlet_model:
    mu = pm.HalfNormal("mu", sd=100 ** 2)
    n_seq_reads = pm.Poisson("n_seq_reads", mu=mu, observed=healthy_reads.sum(axis=1))
    proportions = pm.Dirichlet("proportions", a=np.ones(3), shape=(3,))
    for i in range(healthy_reads.shape[0]):
        draws = pm.Multinomial(
            f"draws_{i}",
            n=healthy_reads[i].sum(),
            p=proportions,
            observed=healthy_reads[i, :],
        )
    dirichlet_trace = pm.sample(draws=2000)
    pm.traceplot(dirichlet_trace)