In this notebook we experiment with implementing Latent Credible Analysis models. Let's build the most simpleLCA

In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf
from tensorflow_probability import edward2 as ed
import tensorflow_probability as tfp

In [3]:
tf.__version__, tfp.__version__

('2.1.0', '0.9.0')

In [4]:
def lca_model(claims):
    """Build a Latent Credibility Analysis (LCA).
   
    A LCA model represents a joint distribution p(Y, H, X), where
    Y represents hidden truth rvs, H represents data source honesty, and X
    represents observation.

    Concretely, let's assume that we have M objects, S data sources, and our
    observation will be the mask matrix W, `mask`, (see build_mask()), and
    observation matrix `observation` (see build_observation()).

    With this context, we have:
        p(Y, H, X) = product_{m=1,..,M}[p(y_m, H, X)], where
        p(y_m, H, X) = p(y_m)product_{s in S_m}[p(b_sm|y_m,s)p(s)],
        where S_m are the set of sources that make claims about an object m.

    Parameters
    ----------
    claims: pd.DataFrame
        a data frame that has columns [source_id, object_id, value]

    """
    problem_sizes = claims.nunique()
    n_sources = problem_sizes['source_id']
    n_objects = problem_sizes['object_id']
    domain_size = claims.groupby('object_id').max()['value'] + 1
    # create honest rv, H_s, for each sources
    honest = []
    for s in pyro.plate(name='sources', size=n_sources):
        honest.append(
            pyro.sample(
                f's_{s}',
                dist.Categorical(
                    probs=pyro.param(f'theta_s_{s}',
                                     init_tensor=_draw_probs(),
                                     constraint=constraints.simplex))))

    # creat hidden truth rv for each object m
    hidden_truth = []
    for m in pyro.plate(name='objects', size=n_objects):
        hidden_truth.append(
            pyro.sample(
                f'y_{m}',
                dist.Categorical(probs=pyro.param(
                    f'theta_m_{m}',
                    init_tensor=torch.ones((domain_size[m], )) /
                    domain_size[m],
                    constraint=constraints.simplex))))

    for c in pyro.plate(name='claims', size=len(claims.index)):
        m = claims.iloc[c]['object_id']
        s = claims.iloc[c]['source_id']
        y_m = hidden_truth[m]
        probs = _build_obj_probs_from_src_honest(pyro.param(f'theta_s_{s}'),
                                                 domain_size[m], y_m)
        pyro.sample(f'b_{s}_{c}', dist.Categorical(probs=probs))
        

Ideas:

    1. because each object has different domain, it is not possible to batch all of their
    distribution, we relax this by expanding all objects' domain to be the biggest domain among 
    all objects. For example if we have three objects with the following domain sizes [3, 2, 5], then we
    just expand [5, 5, 5]. This is a dangerous idea just keep it for now
    
    2. how to perform subsample with this model. Is this possible?

Some synthetic dataset

In [5]:
claims = dict()
claims['source_id'] = [0, 0, 1, 1, 2]
claims['object_id'] = [0, 1, 1, 0, 1]
claims['value'] = [0, 1, 0, 1, 2]
claims = pd.DataFrame(data=claims)
claims

Unnamed: 0,source_id,object_id,value
0,0,0,0
1,0,1,1
2,1,1,0
3,1,0,1
4,2,1,2


# Model $p(x,z)$

In [22]:
def compute_prob_desc(claims):
    problem_sizes = claims.nunique()
    n_sources = problem_sizes['source_id']
    n_objects = problem_sizes['object_id']
    domain_sizes = claims.groupby('object_id').max()['value'] + 1
    return n_sources, n_objects, domain_sizes


#  for c in pyro.plate(name='claims', size=len(claims.index)):
#         m = claims.iloc[c]['object_id']
#         s = claims.iloc[c]['source_id']
#         y_m = hidden_truth[m]
#         probs = _build_obj_probs_from_src_honest(pyro.param(f'theta_s_{s}'),
#                                                  domain_size[m], y_m)
#         pyro.sample(f'b_{s}_{c}', dist.Categorical(probs=probs))
        

def model(claims):
    """a generative model
    
    We assume each source if it asserts an object's value then it is the one and the only assumption
    about that object made by it.
    
    Parameters
    ----------
    claims: pd.DataFrame
        a data frame that has columns [source_id, object_id, value]
    """
    n_sources, n_objects, domain_sizes = compute_prob_desc(claims)
    
    # hidden trusts
    honest_probs = tf.Variable(initial_value=tf.ones(n_sources) * 0.5, name='honest_probs')
    z_trusts = ed.Bernoulli(name=f'z_trusts', probs=honest_probs)
    
    # hidden truths
    object_probs = []
    z_truths = []
    for m in domain_sizes.index:
        object_probs.append(tf.Variable(initial_value=tf.ones(domain_sizes[m],)/domain_sizes[m]))
        z_truths.append(ed.Categorical(name=f'z_truth_{m}', probs=object_probs[m]))
        
    # now generate claims
    x_claims = []
    for c in claims.index:
        s = claims.iloc[c]['source_id']
        m = claims.iloc[c]['object_id']
        z_truth_m = z_truths[m]  
        probs = build_claim_probs(honest_probs[s], domain_sizes[m], z_truth_m.value)
        x_claims.append(ed.Categorical(name=f'x_claim_{c}', probs=probs))
        
def build_claim_probs(honest_prob, domain_size, truth):
    mask = tf.reduce_sum(tf.one_hot([truth], domain_size), axis=0)
    other = tf.ones(domain_size) - mask
    probs = mask*honest_prob*tf.ones(domain_size) + other*((1 - honest_prob) / (domain_size - 1))*tf.ones(domain_size)
    return probs

In [25]:
claims

Unnamed: 0,source_id,object_id,value
0,0,0,0
1,0,1,1
2,1,1,0
3,1,0,1
4,2,1,2


In [27]:
log_prob_fn = ed.make_log_joint_fn(model)
log_prob_fn(claims, **{'z_trusts': [0, 0, 1],
                       'z_truth_0': 1,
                       'z_truth_1': 1,
                       'x_claim_0': 0,
                       'x_claim_1': 1,
                      'x_claim_2': 0,
                      'x_claim_3': 1, 
                      'x_claim_4': 2})

<tf.Tensor: shape=(), dtype=float32, numpy=-8.723231>

Let us verify whether the above computation is correct

In [29]:
log_p_z = tf.math.log(0.5) + tf.math.log(0.5) + tf.math.log(0.5) + tf.math.log(0.5) + tf.math.log(0.33333)

In [31]:
log_p_x_object_1 = tf.math.log(0.5) + tf.math.log(0.5)

In [33]:
log_p_x_object_2 = tf.math.log(0.5) + tf.math.log(0.25) + tf.math.log(0.25)

In [34]:
log_p_z + log_p_x_object_1 + log_p_x_object_2

<tf.Tensor: shape=(), dtype=float32, numpy=-8.723242>

That is our model is correct!

Let's very it against our model joint distribution formula 

# Variational model $p(z)$