In [20]:
import pomegranate
import copy
import random
import time
import tools
import torch
import numpy as np
import pandas as pd
import math
from model import Single_BN
from collections import defaultdict
import torch
import scipy.stats
from torch.distributions import constraints
from matplotlib import pyplot
%matplotlib inline

In [2]:
import pyro
import pyro.distributions as dist
from pyro import poutine
from pyro.infer.autoguide import AutoDelta
from pyro.optim import Adam
from pyro.infer import SVI, TraceEnum_ELBO, config_enumerate, infer_discrete
pyro.enable_validation(True)

In [73]:
def toy_table(nrows = 10000, seed=None):
    """
    Create a toy table for evaluation and debugging purposes.

    Columns:
        attr1: integers drawn uniformly from 0..9
        attr2: integers drawn uniformly from 10..12
        attr3: Normal(mean=3, std=100) samples
        attr4: attr1 + attr2 + Normal(0, 1) noise
        attr5: deterministic function of attr3 (attr3 * 2.5 + 1.5)

    Args:
        nrows: number of rows to generate.
        seed: optional integer seed. When given, a private RandomState is used
            so the table is reproducible and the global NumPy random state is
            left untouched. When None (default), values are drawn from the
            global `np.random` stream.

    Returns:
        pandas DataFrame with `nrows` rows and columns attr1..attr5.
    """
    # RandomState exposes the same randint/normal methods as the np.random
    # module, so both sources can be used interchangeably below.
    rng = np.random if seed is None else np.random.RandomState(seed)
    attr1 = rng.randint(10, size=nrows)
    attr2 = rng.randint(3, size=nrows) + 10
    attr3 = rng.normal(3, 100, size=nrows)
    attr4 = attr1 + attr2 + rng.normal(0, 1, size=nrows)
    attr5 = attr3 * 2.5 + 1.5
    return pd.DataFrame({'attr1': attr1, 'attr2': attr2, 'attr3': attr3, 'attr4': attr4, 'attr5': attr5})

In [65]:
def GMM(data, K=1, lr=0.01):
    """
    Fit a Gaussian mixture to `data` by MAP estimation with Pyro SVI.

    The model puts a Dirichlet(0.5) prior on the mixture weights, a
    LogNormal(0, 20) prior on each component scale and a Normal(0, 10) prior
    on each component location. The per-row assignment is enumerated out by
    TraceEnum_ELBO; weights, locs and scales get point estimates via AutoDelta.

    Args:
        data: observations as a pandas Series, NumPy array or other array-like.
            Expected to be 1-D: the 'data' plate has one scalar Normal
            observation per element.
        K: number of mixture components.
        lr: learning rate for the Adam optimizer.

    Returns:
        Tuple (weights, locs, scales) of NumPy arrays, each of length K.

    Side effects:
        Clears the Pyro param store, rebinds the module-level names
        `global_guide` and `svi`, and prints progress and the estimates.
    """
    # np.asarray handles both pandas objects and plain ndarrays; calling
    # `.values` directly would fail for an ndarray argument.
    data = torch.tensor(np.asarray(data)).type(torch.FloatTensor)

    @config_enumerate
    def model(data):
        # Global variables.
        weights = pyro.sample('weights', dist.Dirichlet(0.5 * torch.ones(K)))
        with pyro.plate('components', K):
            scales = pyro.sample('scales', dist.LogNormal(0., 20.))
            locs = pyro.sample('locs', dist.Normal(0., 10.))

        with pyro.plate('data', len(data)):
            # Local variables.
            assignment = pyro.sample('assignment', dist.Categorical(weights))
            pyro.sample('obs', dist.Normal(locs[assignment], scales[assignment]), obs=data)

    optim = pyro.optim.Adam({'lr': lr, 'betas': [0.8, 0.99]})
    elbo = TraceEnum_ELBO(max_plate_nesting=1)

    def init_loc_fn(site):
        if site["name"] == "weights":
            # Initialize weights to uniform.
            return torch.ones(K) / K
        if site["name"] == "scales":
            # Every component starts at sqrt(var / 2) of the data.
            return torch.tensor([(data.var() / 2).sqrt()]*K)
        if site["name"] == "locs":
            # Start each component at a randomly chosen data point; this is
            # the only seed-dependent part of the initialization.
            return data[torch.multinomial(torch.ones(len(data)) / len(data), K)]
        raise ValueError(site["name"])

    def initialize(seed):
        # The guide and SVI object are published as module globals so that
        # other cells can inspect them after the fit.
        global global_guide, svi
        pyro.set_rng_seed(seed)
        pyro.clear_param_store()
        # `assignment` is blocked from the guide so that it is enumerated.
        global_guide = AutoDelta(poutine.block(model, expose=['weights', 'locs', 'scales']),
                                 init_loc_fn=init_loc_fn)
        svi = SVI(model, global_guide, optim, loss=elbo)
        return svi.loss(model, global_guide, data)

    # Choose the best among 100 random initializations, then re-run the
    # winning seed so the param store holds that initialization.
    loss, seed = min((initialize(seed), seed) for seed in range(100))
    initialize(seed)
    print('seed = {}, initial_loss = {}'.format(seed, loss))

    for i in range(50):
        svi.step(data)
        # One dot per step, with a line break on every tenth step.
        print('.' if i % 10 else '\n', end='')

    map_estimates = global_guide(data)
    weights = map_estimates['weights']
    locs = map_estimates['locs']
    scales = map_estimates['scales']
    print('weights = {}'.format(weights.data.numpy()))
    print('locs = {}'.format(locs.data.numpy()))
    print('scale = {}'.format(scales.data.numpy()))
    return weights.data.numpy(), locs.data.numpy(), scales.data.numpy()

In [74]:
# Build the toy table and fit a mixture to its continuous column.
# Pass the Series itself rather than `.values`: GMM does the tensor conversion.
df = toy_table()
GMM(df['attr3'])

In [78]:
# Number of distinct (non-missing) values in the continuous column attr3.
df['attr3'].nunique(dropna=True)

10000

In [None]:
def conditional_categorical(data):
    """
    Estimate category probabilities for a discrete column.

    This fits exactly the same model as `categorical` (Dirichlet prior over
    `probs`, one Categorical observation per row, AutoDelta point estimate),
    so it delegates to that function instead of duplicating its body.

    NOTE(review): the name suggests that conditioning on another attribute
    was planned; no conditioning is implemented here - confirm the intent.

    Args:
        data: forwarded unchanged to `categorical`.

    Returns:
        The NumPy array of estimated probabilities returned by `categorical`.
    """
    return categorical(data)

In [79]:
def categorical(data):
    """
    Estimate category probabilities of a discrete column by MAP inference.

    The model is a Dirichlet(0.5) prior over `probs` with one Categorical
    observation per row. An AutoDelta guide gives a point estimate of `probs`,
    optimized with Adam for 50 SVI steps.

    Labels are first mapped to contiguous integer codes 0..n_cat-1, following
    the sorted order of the distinct values. For data that already consists of
    every integer in 0..n_cat-1 the codes equal the original labels.

    Args:
        data: 1-D pandas Series or array-like of discrete labels.

    Returns:
        NumPy array of length n_cat; entry i is the estimated probability of
        the i-th smallest distinct label.

    Raises:
        ValueError: if `data` contains no observations.

    Side effects:
        Clears the Pyro param store, rebinds the module-level names
        `global_guide` and `svi`, and prints progress and the estimate.
    """
    # Categorical(probs) only supports values in {0, ..., n_cat-1}. Counting
    # the distinct values is not enough on its own: labels such as 10, 11, 12
    # (or a range with a gap) would fall outside that support.
    labels, codes = np.unique(np.asarray(data), return_inverse=True)
    n_cat = len(labels)
    if n_cat == 0:
        raise ValueError('categorical() needs at least one observation')
    data = torch.as_tensor(codes, dtype=torch.long)

    def model(data):
        # Global variables.
        probs = pyro.sample('probs', dist.Dirichlet(0.5 * torch.ones(n_cat)))
        with pyro.plate('data', len(data)):
            # Local variables.
            pyro.sample('obs', dist.Categorical(probs), obs=data)

    optim = pyro.optim.Adam({'lr': 0.1, 'betas': [0.8, 0.99]})
    elbo = TraceEnum_ELBO(max_plate_nesting=1)

    def init_loc_fn(site):
        if site["name"] == "probs":
            # Initialize probs to uniform.
            return torch.ones(n_cat) / n_cat
        raise ValueError(site["name"])

    def initialize(seed):
        # The guide and SVI object are published as module globals so that
        # other cells can inspect them after the fit.
        global global_guide, svi
        pyro.set_rng_seed(seed)
        pyro.clear_param_store()
        global_guide = AutoDelta(poutine.block(model, expose=['probs']),
                                 init_loc_fn=init_loc_fn)
        svi = SVI(model, global_guide, optim, loss=elbo)
        return svi.loss(model, global_guide, data)

    # NOTE(review): init_loc_fn does not depend on the seed, so this search
    # presumably always selects seed 0 - confirm before simplifying.
    loss, seed = min((initialize(seed), seed) for seed in range(100))
    initialize(seed)
    print('seed = {}, initial_loss = {}'.format(seed, loss))

    for i in range(50):
        svi.step(data)
        # One dot per step, with a line break on every tenth step.
        print('.' if i % 10 else '\n', end='')

    map_estimates = global_guide(data)
    probs = map_estimates['probs']
    print('probs = {}'.format(probs.data.numpy()))
    return probs.data.numpy()

In [80]:
# Fit category probabilities for the discrete column attr1.
categorical(data=df['attr1'])



seed = 0, initial_loss = 23018.04296875

.........
.........
.........
.........
.........probs = [0.10251568 0.09592286 0.09830005 0.09840512 0.10396029 0.10000031
 0.0996993  0.09639328 0.09889549 0.10590766]


array([0.10251568, 0.09592286, 0.09830005, 0.09840512, 0.10396029,
       0.10000031, 0.0996993 , 0.09639328, 0.09889549, 0.10590766],
      dtype=float32)

In [83]:
# Check that a single DataFrame column is a pandas Series (isinstance is the
# idiomatic type test and pd.Series is the public name of the class).
isinstance(df['attr1'], pd.Series)

True

In [84]:
# Show the type NumPy returns for a freshly created array.
type(np.ones(shape=1))

numpy.ndarray