In [39]:
import pgmpy
from pgmpy.models import BayesianModel
import numpy as np
import time
import sys
import copy

In [40]:
sys.path.append('/Users/ziniuwu/Desktop/research/BayesNet')
from Structure.model import BN_Single
from Testing.toy_dataset import *

In [157]:
class Pgmpy_BN(BN_Single):
    """
    Build a single Bayesian Network for a single table using pgmpy
    """

    def __init__(self, table_name, method='Pome', debug=True, infer_algo=None):
        """
        infer_algo: inference method, choose between 'exact', 'BP'
        """
        BN_Single.__init__(self, table_name, method, debug)
        self.infer_algo = infer_algo


    def build_from_data(self, dataset, attr_type=None, n_mcv=30, n_bins=60, ignore_cols=['id'],
                        algorithm="chow-liu", drop_na=True, max_parents=-1, root=0, n_jobs=8):
        """ Build the Pomegranate model from data, including structure learning and paramter learning
            ::Param:: dataset: pandas.dataframe
                      attr_type: type of attributes (binary, discrete or continuous)
                      n_mcv: for categorical data we keep the top n most common values and bin the rest
                      n_bins: number of bins for histogram, larger n_bins will provide more accuracy but less efficiency
            for other parameters, pomegranate gives a detailed explaination:
            https://pomegranate.readthedocs.io/en/latest/BayesianNetwork.html
        """
        self.algorithm = algorithm
        if algorithm != "junction":
            discrete_table = self.learn_model_structure(dataset, attr_type, n_mcv, n_bins, ignore_cols, algorithm,
                                                        drop_na, max_parents, root, n_jobs, return_dataset=True)
        else:
            discrete_table = self.learn_model_structure(dataset, attr_type, n_mcv, n_bins, ignore_cols, 'exact',
                                                        drop_na, max_parents, root, n_jobs, return_dataset=True)

        spec = []
        orphans = []
        for i, parents in enumerate(self.structure):
            for p in parents:
                spec.append((self.node_names[p], self.node_names[i]))
            if not parents:
                orphans.append(self.node_names[i])
        if self.debug:
            print(f"Model spec{spec}")
        self.model = BayesianModel(spec)
        for o in orphans:
            self.model.add_node(o)
        print('calling pgm.BayesianModel.fit...')
        t = time.time()
        self.model.fit(discrete_table)
        if algorithm == "junction":
            self.model = self.model.to_junction_tree()
        print(f"done, took {time.time() - t} secs.")
        self.init_inference_method()


    def init_inference_method(self):
        """
        Initial the inference method for query
        """
        if self.infer_algo is None:
            if self.algorithm == "chow-liu":
                self.infer_algo = "exact"
            else:
                self.infer_algo = "BP"

        if self.infer_algo == "exact":
            from pgmpy.inference import VariableElimination
            self.infer_machine = VariableElimination(self.model)
        elif self.infer_algo == "BP":
            from pgmpy.inference import BeliefPropagation
            self.infer_machine = BeliefPropagation(self.model)
            self.infer_machine.calibrate()
        else:
            raise NotImplemented
            
    def one_iter_of_infer(self, query, n_distinct):
        """Performance a BP in random order.
           This adapts the BP implemented in pgympy package itself.
        """
        copy_query = copy.deepcopy(query)
        sampling_order = copy.deepcopy(self.node_names)
        np.random.shuffle(sampling_order)
        
        p_estimate = 1
        for attr in sampling_order:
            if attr in copy_query:
                print(attr)
                val = copy_query.pop(attr)
                p = self.infer_machine.query([attr], evidence=copy_query).values[val]*n_distinct[attr]
                if np.isnan(p):
                    p_estimate = 0
                    break
                p_estimate *= p
        return p_estimate
        
            
    def infer_point_query(self, query, num_samples=1, return_prob=False):
        """Probability inference using Loopy belief propagation. For example estimate P(X=x, Y=y, Z=z)
           ::Param:: query: dictionary of the form {X:x, Y:y, Z:z}
                     x,y,z can only be a single value
                     num_samples: how many times to run inference, only useful for approaximate algo
                     an approaximation, we might to run it for multiple times and take the average.
                     return_prob: if true, return P(X=x, Y=y, Z=z)
                                  else return P(X=x, Y=y, Z=z)*nrows
        """
        nrows = self.nrows
        n_distinct = dict()
        for attr in query:
            encode_value = self.apply_encoding_to_value(query[attr], attr)
            n_distinct[attr] = self.apply_ndistinct_to_value(encode_value, query[attr], attr)
            query[attr] = encode_value
            
        if self.infer_algo = "exact" or self.num_samples == 1:
            #Using topological order to infer probability
            sampling_order = []
            while len(sampling_order) < len(self.structure):
                for i, deps in enumerate(self.structure):
                    if i in sampling_order:
                        continue  # already ordered
                    if all(d in sampling_order for d in deps):
                        sampling_order.append(i)
            sampling_order = [self.node_names[i] for i in sampling_order]
            print(sampling_order)
            
            p_estimate = 1
            for attr in sampling_order:
                if attr in query:
                    print(attr)
                    val = query.pop(attr)
                    p = self.infer_machine.query([attr], evidence=query).values[val]*n_distinct[attr]
                    if np.isnan(p):
                        p_estimate = 0
                        break
                    p_estimate *= p
                        
        else:
            p_estimates = []
            for i in range(num_samples):
                p_estimates.append(self.one_iter_of_infer(query, n_distinct))
            p_estimate = sum(p_estimates)/num_samples
        
        if return_prob:
            return (p_estimate, nrows)
        return round(p_estimate * nrows)
    
    
    def infer_range_query_BP(self, query, num_samples=1, return_prob=False):
        """Probability inference using Loopy belief propagation. For example estimate P(X=x, Y=y, Z=z)
           ::Param:: query: dictionary of the form {X:[x], Y:[y], Z:[z]}
                     x,y,z can only be set of single value
                     num_samples: how many times to run inference. Since Loopy belief propagation is sometime
                     an approaximation, we might to run it for multiple times and take the average.
                     return_prob: if true, return P(X=x, Y=y, Z=z)
                                  else return P(X=x, Y=y, Z=z)*nrows
           LBP for estimating range query can be really slow
        """
        p_estimate = 0
        for query_tuple in cartesian_product(query):
            point_query = dict()
            i = 0
            for attr in query:
                point_query[attr] = query_tuple[i]
                i += 1
            p_estimate += self.infer_point_query_BP(point_query, return_prob=True)[0]
        
        return p_estimate*self.nrows


    def infer_point_query_exact(self, query, return_prob=False):
        nrows = self.nrows
        n_distinct = dict()
        for attr in query:
            encode_value = self.apply_encoding_to_value(query[attr], attr)
            n_distinct[attr] = self.apply_ndistinct_to_value(encode_value, query[attr], attr)
            query[attr] = encode_value
            
        #Using topological order to infer probability
        sampling_order = []
        while len(sampling_order) < len(self.structure):
            for i, deps in enumerate(self.structure):
                if i in sampling_order:
                    continue  # already ordered
                if all(d in sampling_order for d in deps):
                    sampling_order.append(i)
        sampling_order = [self.node_names[i] for i in sampling_order]
        print(sampling_order)

        p_estimate = 1
        for attr in sampling_order:
            if attr in query:
                print(attr)
                val = query.pop(attr)
                p = self.infer_machine.query([attr], evidence=query).values[val]*n_distinct[attr]
                if np.isnan(p):
                    p_estimate = 0
                    break
                p_estimate *= p
                
        if return_prob:
            return (p_estimate, nrows)
        return round(p_estimate * nrows)


    def infer_point_query(self, query, num_samples=1, return_prob=False):
        """Probability inference using Variable elimination, which is the exact inference algorithm.
            For example estimate P(X=x, Y=y, Z=z)
           ::Param:: query: dictionary of the form {X:x, Y:y, Z:z}
                     x,y,z can only be a single value
                     num_samples: only useful if using an approaximate inference
                     return_prob: if true, return P(X=x, Y=y, Z=z)
                                  else return P(X=x, Y=y, Z=z)*nrows
        """
        assert self.infer_algo is not None, "must call .init_inference_method() first"
        
    

In [4]:
df = toy_data_highly_correlated_cat(nrows=100000, return_df=True)

In [158]:
BN = Pgmpy_BN('title')
BN.build_from_data(df)

Model spec[('cont_attr3', 'cont_attr2'), ('cont_attr1', 'cont_attr3'), ('cont_attr8', 'cont_attr4'), ('cont_attr8', 'cont_attr5'), ('cont_attr3', 'cont_attr6'), ('cont_attr1', 'cont_attr7'), ('cont_attr3', 'cont_attr8')]
calling pgm.BayesianModel.fit...
done, took 0.09728121757507324 secs.


In [159]:
tic = time.time()
print(len(df.query('cont_attr1 == 2').query('cont_attr2 == 3')))
toc = time.time()
print(toc-tic)
BN.init_inference_method()
print(BN.infer_point_query_BP({'cont_attr1': 2, 'cont_attr2': 3}, num_samples=1))
print(time.time()-toc)

Finding Elimination Order: : 100%|██████████| 6/6 [00:00<00:00, 5628.68it/s]
Eliminating: cont_attr3: 100%|██████████| 6/6 [00:00<00:00, 324.38it/s]
Finding Elimination Order: : 100%|██████████| 7/7 [00:00<00:00, 5877.90it/s]
Eliminating: cont_attr3: 100%|██████████| 7/7 [00:00<00:00, 328.28it/s]

1034
0.011144876480102539
cont_attr1
cont_attr2
cont_attr2



Finding Elimination Order: : 100%|██████████| 6/6 [00:00<00:00, 4522.16it/s]
Eliminating: cont_attr3: 100%|██████████| 6/6 [00:00<00:00, 309.04it/s]
Finding Elimination Order: : 100%|██████████| 7/7 [00:00<00:00, 6763.45it/s]
Eliminating: cont_attr6: 100%|██████████| 7/7 [00:00<00:00, 191.53it/s]
Finding Elimination Order: : 100%|██████████| 6/6 [00:00<00:00, 7210.84it/s]
Eliminating: cont_attr3: 100%|██████████| 6/6 [00:00<00:00, 293.13it/s]
Finding Elimination Order: : 100%|██████████| 7/7 [00:00<00:00, 6403.52it/s]
Eliminating: cont_attr6: 100%|██████████| 7/7 [00:00<00:00, 211.09it/s]


cont_attr1
cont_attr2
cont_attr1
cont_attr2


Finding Elimination Order: : 100%|██████████| 6/6 [00:00<00:00, 6610.41it/s]
Eliminating: cont_attr3: 100%|██████████| 6/6 [00:00<00:00, 310.43it/s]
Finding Elimination Order: : 100%|██████████| 7/7 [00:00<00:00, 6499.92it/s]
Eliminating: cont_attr6: 100%|██████████| 7/7 [00:00<00:00, 214.77it/s]
Finding Elimination Order: : 100%|██████████| 6/6 [00:00<00:00, 5515.19it/s]
Eliminating: cont_attr3: 100%|██████████| 6/6 [00:00<00:00, 314.86it/s]
Finding Elimination Order: : 100%|██████████| 7/7 [00:00<00:00, 5552.22it/s]
Eliminating: cont_attr6:   0%|          | 0/7 [00:00<?, ?it/s]

cont_attr1
cont_attr2
cont_attr1


Eliminating: cont_attr6: 100%|██████████| 7/7 [00:00<00:00, 209.34it/s]
Finding Elimination Order: : 100%|██████████| 6/6 [00:00<00:00, 5000.16it/s]
Eliminating: cont_attr3: 100%|██████████| 6/6 [00:00<00:00, 307.19it/s]
Finding Elimination Order: : 100%|██████████| 7/7 [00:00<00:00, 5631.02it/s]
Eliminating: cont_attr6: 100%|██████████| 7/7 [00:00<00:00, 210.63it/s]
Finding Elimination Order: : 100%|██████████| 6/6 [00:00<00:00, 6818.16it/s]
Eliminating: cont_attr8:   0%|          | 0/6 [00:00<?, ?it/s]

cont_attr2
cont_attr1
cont_attr2


Eliminating: cont_attr3: 100%|██████████| 6/6 [00:00<00:00, 324.80it/s]
Finding Elimination Order: : 100%|██████████| 7/7 [00:00<00:00, 5743.37it/s]
Eliminating: cont_attr6: 100%|██████████| 7/7 [00:00<00:00, 196.88it/s]
Finding Elimination Order: : 100%|██████████| 6/6 [00:00<00:00, 6934.64it/s]
Eliminating: cont_attr3: 100%|██████████| 6/6 [00:00<00:00, 264.75it/s]
Finding Elimination Order: : 100%|██████████| 7/7 [00:00<00:00, 5604.15it/s]
Eliminating: cont_attr8:   0%|          | 0/7 [00:00<?, ?it/s]

cont_attr1
cont_attr1
cont_attr2


Eliminating: cont_attr3: 100%|██████████| 7/7 [00:00<00:00, 271.60it/s]
Finding Elimination Order: : 100%|██████████| 6/6 [00:00<00:00, 4889.42it/s]
Eliminating: cont_attr3: 100%|██████████| 6/6 [00:00<00:00, 297.32it/s]
Finding Elimination Order: : 100%|██████████| 7/7 [00:00<00:00, 6603.72it/s]
Eliminating: cont_attr3: 100%|██████████| 7/7 [00:00<00:00, 312.58it/s]
Finding Elimination Order: : 100%|██████████| 6/6 [00:00<00:00, 4619.28it/s]
Eliminating: cont_attr3: 100%|██████████| 6/6 [00:00<00:00, 317.58it/s]


cont_attr1
cont_attr2
cont_attr1
cont_attr2


Finding Elimination Order: : 100%|██████████| 7/7 [00:00<00:00, 6381.25it/s]
Eliminating: cont_attr3: 100%|██████████| 7/7 [00:00<00:00, 305.98it/s]

1034.0
1.544321060180664





In [28]:
from pgmpy.inference import VariableElimination
inference = VariableElimination(BN.model)

In [35]:
tic = time.time()
print(inference.query(['cont_attr1'], evidence={'cont_attr3': [0,1], 'cont_attr4': [0,1]}))
print(time.time()-tic)

TypeError: unhashable type: 'list'

In [19]:
a = inference.query(['cont_attr1'], evidence={'cont_attr3': [0,1], 'cont_attr4': [0,1]}))

Finding Elimination Order: : 100%|██████████| 4/4 [00:00<00:00, 3370.27it/s]
Eliminating: cont_attr8: 100%|██████████| 4/4 [00:00<00:00, 339.67it/s]


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [36]:
from pgmpy.inference import BeliefPropagation
infer_machine = BeliefPropagation(BN.model)
infer_machine.calibrate()

In [127]:
a = infer_machine.query(['cont_attr1'], evidence={'cont_attr2':1, 'cont_attr4':18})

Eliminating: cont_attr8: 100%|██████████| 3/3 [00:00<00:00, 303.98it/s]


In [128]:
print(a)

+---------------+-------------------+
| cont_attr1    |   phi(cont_attr1) |
| cont_attr1(0) |               nan |
+---------------+-------------------+
| cont_attr1(1) |               nan |
+---------------+-------------------+
| cont_attr1(2) |               nan |
+---------------+-------------------+
| cont_attr1(3) |               nan |
+---------------+-------------------+
| cont_attr1(4) |               nan |
+---------------+-------------------+
| cont_attr1(5) |               nan |
+---------------+-------------------+
| cont_attr1(6) |               nan |
+---------------+-------------------+
| cont_attr1(7) |               nan |
+---------------+-------------------+
| cont_attr1(8) |               nan |
+---------------+-------------------+
| cont_attr1(9) |               nan |
+---------------+-------------------+


In [129]:
np.isnan(a.values)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [152]:
round(99.765)

100