In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc3 as pm
import numpy.random as npr

%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Using cuDNN version 5110 on context None
Mapped name None to device cuda: GeForce GTX 1080 (0000:01:00.0)


# Introduction

Let's say three bacterial species characterize the gut, and we hypothesize that their relative proportions are slightly shifted between healthy and sick patients, but we don't know by how much (i.e. pretend we do not know the data-generating distribution below). Can we infer the proportion parameters and their uncertainty?

# Generate Synthetic Data

In the synthetic dataset generated below, we pretend that every patient contributes one sample, and we record the number of sequencing reads assigned to each OTU (operational taxonomic unit; here, a bacterial species). Each row is one sample (patient), and each column is one OTU (bacterial species).

## Proportions

Firstly, let's generate the ground truth proportions that we will infer later on.

In [2]:
def proportion(arr):
    """Normalize a collection of weights so that they sum to one.

    Parameters
    ----------
    arr : array-like
        Relative weights (e.g. raw counts), one per category.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``arr`` whose entries sum to 1.

    Raises
    ------
    ValueError
        If the weights sum to zero (this includes empty input), because the
        division would otherwise silently produce NaN/inf "proportions".
    """
    arr = np.asarray(arr)
    total = arr.sum()
    if total == 0:
        raise ValueError("cannot normalize weights that sum to zero")
    return arr / total


# Ground-truth species proportions for the healthy cohort (relative weights 10:16:2).
healthy_proportions = proportion([10, 16, 2])
healthy_proportions

array([ 0.35714286,  0.57142857,  0.07142857])

In [3]:
# Ground-truth species proportions for the sick cohort (relative weights 10:27:15).
sick_proportions = proportion([10, 27, 15])
sick_proportions

array([ 0.19230769,  0.51923077,  0.28846154])

## Data

Now, given the proportions, let's generate data. Here, we assume that there are 10 patients per cohort (10 sick patients and 10 healthy patients), and that every patient's sample contains exactly 50 sequencing reads in total.

In [4]:
# Number of patients (rows of the read-count matrix) simulated per cohort.
n_data_points = 10


def make_healthy_multinomial(arr, n_sequencing_reads=50):
    """Draw one healthy patient's read counts from a multinomial distribution.

    Parameters
    ----------
    arr : array-like
        Ignored. It exists only so this function can serve as the ``func1d``
        callback of ``np.apply_along_axis``, which passes each row here.
    n_sequencing_reads : int, optional
        Total number of reads distributed across the species. Defaults to 50;
        a draw such as ``npr.poisson(lam=50)`` could be passed to simulate
        variable sequencing depth.

    Returns
    -------
    numpy.ndarray
        Integer counts per species, summing to ``n_sequencing_reads``, drawn
        with probabilities ``healthy_proportions``.
    """
    return npr.multinomial(n_sequencing_reads, healthy_proportions)


def make_sick_multinomial(arr, n_sequencing_reads=50):
    """Draw one sick patient's read counts from a multinomial distribution.

    Parameters
    ----------
    arr : array-like
        Ignored. It exists only so this function can serve as the ``func1d``
        callback of ``np.apply_along_axis``, which passes each row here.
    n_sequencing_reads : int, optional
        Total number of reads distributed across the species. Defaults to 50;
        a draw such as ``npr.poisson(lam=50)`` could be passed to simulate
        variable sequencing depth.

    Returns
    -------
    numpy.ndarray
        Integer counts per species, summing to ``n_sequencing_reads``, drawn
        with probabilities ``sick_proportions``.
    """
    return npr.multinomial(n_sequencing_reads, sick_proportions)


# Seed NumPy's global RNG so the synthetic cohorts are identical on every
# Restart & Run All.
npr.seed(42)

# np.apply_along_axis calls the generator once per row. This array only fixes
# the number of rows (patients); the generators ignore its values.
row_placeholder = np.zeros((n_data_points, 3))

# Generate healthy data
healthy_reads = np.apply_along_axis(
    make_healthy_multinomial, axis=1, arr=row_placeholder
)

# Generate sick reads
sick_reads = np.apply_along_axis(make_sick_multinomial, axis=1, arr=row_placeholder)

In [5]:
# Wrap each cohort's read counts in a labelled DataFrame (one column per
# species) and cast it with pm.floatX.
healthy_df, sick_df = (
    pm.floatX(pd.DataFrame(reads, columns=["bacteria1", "bacteria2", "bacteria3"]))
    for reads in (healthy_reads, sick_reads)
)

In [6]:
# Check the column dtypes of the healthy cohort after the pm.floatX cast.
healthy_df.dtypes

bacteria1    float32
bacteria2    float32
bacteria3    float32
dtype: object

In [7]:
# Check the column dtypes of the sick cohort after the pm.floatX cast.
sick_df.dtypes

bacteria1    float32
bacteria2    float32
bacteria3    float32
dtype: object

# Model Construction

Here's an implementation of the model - Dirichlet prior with Multinomial likelihood.

There are 3 classes of bacteria, so a 3-dimensional Dirichlet distribution serves as the prior over the vector of class probabilities used by the multinomial distribution.

The multinomial distribution serves as the likelihood function.

In [8]:
with pm.Model() as dirichlet_model:
    # Flat Dirichlet(1, 1, 1) prior over the three species proportions.
    # pm.floatX is used instead of a hard-coded float32 so the dtype follows
    # the same configuration as the observed data.
    prior_concentration = pm.floatX(np.ones(3))

    # A Dirichlet variable lives on the probability simplex, so its test value
    # must sum to one. The previous [0.1, 0.1, 0.1] summed to 0.3; use the
    # centre of the simplex instead.
    simplex_centre = pm.floatX(np.full(3, 1.0 / 3.0))

    # Total reads per patient; must equal the read depth used to simulate the data.
    n_reads_per_sample = 50

    proportions_healthy = pm.Dirichlet(
        "proportions_healthy",
        a=prior_concentration,
        shape=(3,),
        testval=simplex_centre,
    )
    proportions_sick = pm.Dirichlet(
        "proportions_sick",
        a=prior_concentration,
        shape=(3,),
        testval=simplex_centre,
    )

    # Each row of the observed matrices is one patient's read counts.
    healthy_like = pm.Multinomial(
        "like_healthy",
        n=n_reads_per_sample,
        p=proportions_healthy,
        observed=healthy_df.values,
    )
    sick_like = pm.Multinomial(
        "like_sick",
        n=n_reads_per_sample,
        p=proportions_sick,
        observed=sick_df.values,
    )

## Sampling

In [9]:
with dirichlet_model:
    # Start the Metropolis chain from the maximum a posteriori point estimate.
    map_estimate = pm.find_MAP()
    dirichlet_trace = pm.sample(
        draws=10000,
        start=map_estimate,
        step=pm.Metropolis(),
        # Fixed seed so the trace is reproducible across kernel restarts.
        random_seed=42,
    )
    pm.traceplot(dirichlet_trace)

ValueError: ('The following error happened while compiling the node', GpuElemwise{Composite{(i0 + log((Composite{inv(Cast{float32}((i0 - i1)))}(i1, i2) / (i3 - Composite{inv(Cast{float32}((i0 - i1)))}(i1, i2)))))}}[(0, 0)]<gpuarray>(GpuFromHost<None>.0, GpuElemwise{Add}[(0, 1)]<gpuarray>.0, GpuFromHost<None>.0, GpuArrayConstant{[ 1.]}), '\n', 'Cannot compute test value: input 0 (<int64>) of Op Composite{inv(Cast{float32}((i0 - i1)))}(<int64>, <int64>) missing default value. ')

# Results

In [None]:
# Forest plot of the posterior proportions, labelled cohort by cohort
# (healthy species 1-3 first, then sick species 1-3).
pm.forestplot(
    dirichlet_trace,
    ylabels=[
        "{}_bacteria{}".format(cohort, species)
        for cohort in ("healthy", "sick")
        for species in (1, 2, 3)
    ],
)

In [None]:
# Ground-truth proportions, for comparison with the posterior estimates in the forest plot.
healthy_proportions, sick_proportions

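If sampling succeeds, the posterior estimates in the forest plot should match up with the original synthetic proportions shown above. Note that in the saved run the sampling cell raised an error, so this comparison still needs to be re-verified.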