In [1]:
import pgmpy
from pgmpy.models import BayesianModel
import numpy as np
import time
import sys
import copy

In [2]:
sys.path.append('/Users/ziniuwu/Desktop/research/BayesNet')
from Structure.BN_single_model import BN_Single
from Testing.toy_dataset import *
from Inference.utils import cartesian_product

In [3]:
class Pgmpy_BN(BN_Single):
    """
    Build a single Bayesian Network for a single table using pgmpy.

    Structure learning is delegated to ``BN_Single.learn_model_structure``; pgmpy is
    used to fit the conditional probability tables and to answer queries.
    """

    def __init__(self, table_name, method='Pome', debug=True, infer_algo=None):
        """
        infer_algo: inference method, choose between 'exact', 'BP'.
                    When None, init_inference_method() picks one from the build algorithm.
        """
        BN_Single.__init__(self, table_name, method, debug)
        self.infer_algo = infer_algo


    def build_from_data(self, dataset, attr_type=None, sample_size=1000000, n_mcv=30, n_bins=60, ignore_cols=['id'],
                        algorithm="chow-liu", drop_na=True, max_parents=-1, root=0, n_jobs=8):
        """ Build the pgmpy model from data, including structure learning and parameter learning
            ::Param:: dataset: pandas.dataframe
                      attr_type: type of attributes (binary, discrete or continuous)
                      sample_size: subsample the number of rows to use to learn structure
                      n_mcv: for categorical data we keep the top n most common values and bin the rest
                      n_bins: number of bins for histogram, larger n_bins will provide more accuracy but less efficiency
                      algorithm: structure learning algorithm; "junction" learns the structure with
                                 'exact' and then tries to convert the fitted model to a junction tree
            the structure-learning parameters are forwarded unchanged to learn_model_structure;
            pomegranate gives a detailed explanation of them:
            https://pomegranate.readthedocs.io/en/latest/BayesianNetwork.html
        """
        self.algorithm = algorithm
        # "junction" is not a structure-learning algorithm by itself: learn with 'exact' first.
        structure_algorithm = 'exact' if algorithm == "junction" else algorithm
        # ignore_cols is only forwarded, never mutated here, so the shared list default is safe.
        discrete_table = self.learn_model_structure(dataset, attr_type, sample_size,
                                                    n_mcv, n_bins, ignore_cols, structure_algorithm,
                                                    drop_na, max_parents, root, n_jobs, return_dataset=True)

        # Translate the parent-index structure into (parent_name, child_name) edges.
        spec = []
        orphans = []
        for i, parents in enumerate(self.structure):
            for p in parents:
                spec.append((self.node_names[p], self.node_names[i]))
            if not parents:
                orphans.append(self.node_names[i])
        if self.debug:
            print(f"Model spec{spec}")
        self.model = BayesianModel(spec)
        # Nodes without parents may not appear in any edge, so register them explicitly.
        for o in orphans:
            self.model.add_node(o)
        print('calling pgm.BayesianModel.fit...')
        t = time.time()
        self.model.fit(discrete_table)
        if algorithm == "junction":
            try:
                self.model = self.model.to_junction_tree()
            except Exception as err:
                # Best effort: keep the plain Bayesian network when the conversion fails.
                print("This BN is not able to transform into junction tree, just use BN")
                if self.debug:
                    print(f"junction tree conversion failed with: {err!r}")
        print(f"done, took {time.time() - t} secs.")
        self.init_inference_method()


    def init_inference_method(self):
        """
        Initialise the inference engine used to answer queries.

        When no algorithm was requested, "junction" builds default to 'BP' and every
        other build defaults to 'exact'.
        Raises NotImplementedError for an unknown inference algorithm.
        """
        if self.infer_algo is None:
            if self.algorithm == "junction":
                self.infer_algo = "BP"
            else:
                self.infer_algo = "exact"

        if self.infer_algo == "exact":
            from pgmpy.inference import VariableElimination
            self.infer_machine = VariableElimination(self.model)
        elif self.infer_algo == "BP":
            from pgmpy.inference import BeliefPropagation
            self.infer_machine = BeliefPropagation(self.model)
            self.infer_machine.calibrate()
        else:
            # `NotImplemented` is a constant, not an exception; raising it is a TypeError.
            raise NotImplementedError(f"unsupported inference algorithm: {self.infer_algo!r}")

    def _topological_order(self):
        """Return the node names ordered so that every node comes after all of its parents."""
        ordered = []
        placed = set()
        n_nodes = len(self.structure)
        while len(ordered) < n_nodes:
            n_before = len(ordered)
            for i, deps in enumerate(self.structure):
                if i in placed:
                    continue  # already ordered
                if all(d in placed for d in deps):
                    ordered.append(i)
                    placed.add(i)
            if len(ordered) == n_before:
                # No node could be placed in a full pass: the loop would never terminate.
                raise ValueError("self.structure contains a cycle or an invalid parent index")
        return [self.node_names[i] for i in ordered]

    def _chain_rule_estimate(self, query, n_distinct, order):
        """Multiply P(attr | remaining query attributes) over the queried attributes in `order`.

        `query` maps attribute names to encoded values and is not modified.
        Returns 0 as soon as the inference engine yields NaN probabilities.
        """
        remaining = dict(query)
        p_estimate = 1
        for attr in order:
            if attr not in remaining:
                continue
            val = remaining.pop(attr)
            probs = self.infer_machine.query([attr], evidence=remaining).values
            if np.isnan(probs).any():
                return 0
            p_estimate *= probs[val] / np.sum(probs) * n_distinct[attr]
        return p_estimate

    def one_iter_of_infer(self, query, n_distinct):
        """Estimate the query probability once, factorising it in a random attribute order.

        `query` (encoded values) is left untouched so the method can be called repeatedly
        with the same dictionary.
        """
        sampling_order = list(self.node_names)
        np.random.shuffle(sampling_order)
        return self._chain_rule_estimate(query, n_distinct, sampling_order)


    def infer_point_query(self, query, num_samples=1, return_prob=False):
        """Probability inference for a point query. For example estimate P(X=x, Y=y, Z=z)
           ::Param:: query: dictionary of the form {X:x, Y:y, Z:z}
                     x,y,z can only be a single value; the dictionary is not modified
                     num_samples: how many times to run inference, only used when the inference
                     algorithm is not 'exact'; the estimates of the runs are averaged.
                     return_prob: if true, return (P(X=x, Y=y, Z=z), nrows)
                                  else return P(X=x, Y=y, Z=z)*nrows rounded
           A value that apply_encoding_to_value maps to None yields an estimate of 0.
        """
        assert self.infer_algo is not None, "must call .init_inference_method() first"

        nrows = self.nrows
        n_distinct = dict()
        encoded_query = dict()
        for attr, value in query.items():
            encode_value = self.apply_encoding_to_value(value, attr)
            if encode_value is None:
                return (0, nrows) if return_prob else 0
            n_distinct[attr] = self.apply_ndistinct_to_value(encode_value, value, attr)
            encoded_query[attr] = encode_value

        if self.infer_algo == "exact" or num_samples == 1:
            # Using topological order to infer probability
            p_estimate = self._chain_rule_estimate(encoded_query, n_distinct, self._topological_order())
        else:
            p_estimates = [self.one_iter_of_infer(encoded_query, n_distinct) for _ in range(num_samples)]
            p_estimate = sum(p_estimates) / num_samples

        if self.debug:
            print(p_estimate)
        if return_prob:
            return (p_estimate, nrows)
        return round(p_estimate * nrows)


    def infer_query(self, query, num_samples=1, return_prob=False):
        """Probability inference for a set query, summed over every value combination.
           ::Param:: query: dictionary of the form {X:[x], Y:[y], Z:[z]}
                     the combinations are produced by cartesian_product(query)
                     num_samples: how many times to run inference per combination, forwarded to
                     infer_point_query.
                     return_prob: if true, return (P(X in xs, Y in ys, Z in zs), nrows)
                                  else return that probability * nrows rounded
           Estimating a range query this way can be really slow: one point query per combination.
        """
        p_estimate = 0
        for query_tuple in cartesian_product(query):
            point_query = {attr: query_tuple[i] for i, attr in enumerate(query)}
            if self.debug:
                print(point_query)
            p_estimate += self.infer_point_query(point_query, num_samples=num_samples, return_prob=True)[0]

        if return_prob:
            return (p_estimate, self.nrows)
        return round(p_estimate * self.nrows)
    
    

    

In [4]:
# NOTE(review): absolute, machine-specific path; `pd` is not imported in any visible cell.
df = pd.read_hdf("/home/ziniu.wzn/imdb-benchmark/gen_single_light/title.hdf")
print(len(df))
# Replace every '.' in the column names with '__'.
new_cols = [col.replace('.', '__') for col in df.columns]
df.columns = new_cols
print(df.head(20))

Unnamed: 0,title.id,title.kind_id,title.production_year,title.mul_movie_info_idx.movie_id,title.mul_movie_info_idx.movie_id_nn,title.mul_movie_info.movie_id,title.mul_movie_info.movie_id_nn,title.mul_cast_info.movie_id,title.mul_cast_info.movie_id_nn,title.mul_movie_keyword.movie_id,title.mul_movie_keyword.movie_id_nn,title.mul_movie_companies.movie_id,title.mul_movie_companies.movie_id_nn
80889,80889,7,1980.0,0.0,1.0,8.0,8.0,0.0,1.0,0.0,1.0,0.0,1.0
5156,5156,7,2010.0,0.0,1.0,1.0,1.0,11.0,11.0,0.0,1.0,0.0,1.0
197772,197772,7,1962.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
111913,111913,7,2012.0,0.0,1.0,1.0,1.0,12.0,12.0,0.0,1.0,0.0,1.0
117556,117556,7,1992.788762,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
40704,40704,7,1971.0,3.0,3.0,11.0,11.0,24.0,24.0,0.0,1.0,5.0,5.0
164312,164312,7,1997.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0
149337,149337,7,2005.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
32020,32020,7,2011.0,0.0,1.0,1.0,1.0,9.0,9.0,0.0,1.0,0.0,1.0
36858,36858,7,2012.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0


In [5]:
# Two toy tables of different sizes (10k and 40k rows).
# return_df=True presumably yields pandas DataFrames — confirm in Testing.toy_dataset.
toy_df1 = toy_data_highly_correlated_cont(nrows=10000, return_df=True)
toy_df2 = toy_data_highly_correlated_cont(nrows=40000, return_df=True)

In [42]:
# Learn a Chow-Liu structured network for the title table and fit it with pgmpy.
# NOTE(review): the loading cell renames '.' to '__', which would give 'title__id' rather than
# the 'title_id' ignored here — confirm which column names `df` really has at this point.
BN = Pgmpy_BN('title')
BN.build_from_data(df, algorithm="chow-liu", max_parents=2, ignore_cols=['title_id'], sample_size=500000)

Model spec[('title_mul_movie_info_movie_id', 'title_production_year'), ('title_mul_movie_info_movie_id', 'title_mul_movie_info_idx_movie_id'), ('title_mul_movie_info_idx_movie_id', 'title_mul_movie_info_idx_movie_id_nn'), ('title_kind_id', 'title_mul_movie_info_movie_id'), ('title_mul_movie_info_movie_id', 'title_mul_movie_info_movie_id_nn'), ('title_mul_movie_info_movie_id', 'title_mul_cast_info_movie_id'), ('title_mul_cast_info_movie_id', 'title_mul_cast_info_movie_id_nn'), ('title_mul_movie_info_movie_id', 'title_mul_movie_keyword_movie_id'), ('title_mul_movie_keyword_movie_id', 'title_mul_movie_keyword_movie_id_nn'), ('title_mul_movie_info_movie_id', 'title_mul_movie_companies_movie_id'), ('title_mul_movie_companies_movie_id', 'title_mul_movie_companies_movie_id_nn')]
calling pgm.BayesianModel.fit...
done, took 5.809454441070557 secs.


In [27]:
# Inspect how raw production-year values are mapped to integer codes.
print(BN.encoding['title_production_year'])
# presumably the number of distinct raw values that fall into each code — verify in BN_Single
print(BN.n_in_bin['title_production_year'])

{2012.0: 0, 2011.0: 1, 2010.0: 2, 2009.0: 3, 2008.0: 4, 2007.0: 5, 2006.0: 6, 2005.0: 7, 2004.0: 8, 1992.7887619952553: 9, 2003.0: 10, 2013.0: 11, 2002.0: 12, 2001.0: 13, 2000.0: 14, 1999.0: 15, 1998.0: 16, 1997.0: 17, 1996.0: 18, 1995.0: 19, 1994.0: 20, 1993.0: 21, 1992.0: 22, 1991.0: 23, 1990.0: 24, 1989.0: 25, 1987.0: 26, 1988.0: 27, 1986.0: 28, 1985.0: 29, 1880.0: 30, 1888.0: 30, 1889.0: 30, 1890.0: 30, 1891.0: 30, 1892.0: 30, 1893.0: 30, 1894.0: 30, 1895.0: 30, 1896.0: 30, 1897.0: 30, 1898.0: 30, 1899.0: 30, 1900.0: 30, 1901.0: 30, 1902.0: 30, 1903.0: 31, 1904.0: 31, 1905.0: 31, 1906.0: 31, 1907.0: 31, 1908.0: 31, 1909.0: 32, 1910.0: 32, 1911.0: 32, 1912.0: 33, 1913.0: 33, 1914.0: 34, 1915.0: 34, 1916.0: 35, 1917.0: 35, 1918.0: 36, 1919.0: 36, 1920.0: 36, 1921.0: 37, 1922.0: 37, 1923.0: 37, 1924.0: 37, 1925.0: 38, 1926.0: 38, 1927.0: 38, 1928.0: 38, 1929.0: 39, 1930.0: 39, 1931.0: 39, 1932.0: 39, 1933.0: 40, 1934.0: 40, 1935.0: 40, 1936.0: 40, 1937.0: 41, 1938.0: 41, 1939.0: 41, 1

In [40]:
# Ground truth: count the matching rows with pandas and time it.
tic = time.time()
print(len(df.query('title_mul_cast_info_movie_id_nn == [3,12,24]').query('title_kind_id in [7,2]').query('title_production_year == 2011.0')))
toc = time.time()
print(toc-tic)
# Estimate the same cardinality with the Bayesian network.
# NOTE(review): the elapsed time printed last also covers init_inference_method() and the prints above.
BN.init_inference_method()
# 'title_production_year' is a scalar while the other values are lists —
# assumes cartesian_product accepts both; TODO confirm.
print(BN.infer_query({'title_mul_cast_info_movie_id_nn' : [3,12,24], 'title_kind_id': [7,2], 'title_production_year': 2011}, num_samples=1))
print(time.time()-toc)

Finding Elimination Order: : 100%|██████████| 9/9 [00:00<00:00, 3560.53it/s]
Eliminating: title_mul_cast_info_movie_id: 100%|██████████| 9/9 [00:00<00:00, 294.04it/s]
Finding Elimination Order: : 100%|██████████| 10/10 [00:00<00:00, 3914.79it/s]
Eliminating: title_mul_cast_info_movie_id_nn:   0%|          | 0/10 [00:00<?, ?it/s]    

9615
0.15904784202575684
{'title_mul_cast_info_movie_id_nn': 3, 'title_kind_id': 7, 'title_production_year': 2011}


Eliminating: title_mul_cast_info_movie_id: 100%|██████████| 10/10 [00:00<00:00, 149.31it/s]
Finding Elimination Order: : 100%|██████████| 11/11 [00:00<00:00, 4270.00it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 11/11 [00:00<00:00, 124.75it/s]
Finding Elimination Order: : 100%|██████████| 9/9 [00:00<00:00, 3682.09it/s]
Eliminating: title_mul_cast_info_movie_id: 100%|██████████| 9/9 [00:00<00:00, 303.72it/s]
Finding Elimination Order: : 100%|██████████| 10/10 [00:00<00:00, 5267.90it/s]
Eliminating: title_mul_movie_keyword_movie_id:   0%|          | 0/10 [00:00<?, ?it/s]   

0.002475891128469264
{'title_mul_cast_info_movie_id_nn': 3, 'title_kind_id': 2, 'title_production_year': 2011}


Eliminating: title_mul_cast_info_movie_id: 100%|██████████| 10/10 [00:00<00:00, 152.86it/s]
Finding Elimination Order: : 100%|██████████| 11/11 [00:00<00:00, 4358.75it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 11/11 [00:00<00:00, 123.97it/s]
Finding Elimination Order: : 100%|██████████| 9/9 [00:00<00:00, 3208.56it/s]
Eliminating: title_mul_cast_info_movie_id: 100%|██████████| 9/9 [00:00<00:00, 261.59it/s]
Finding Elimination Order: : 100%|██████████| 10/10 [00:00<00:00, 3309.90it/s]
Eliminating: title_mul_cast_info_movie_id_nn:   0%|          | 0/10 [00:00<?, ?it/s]    

0.00014947010450651368
{'title_mul_cast_info_movie_id_nn': 12, 'title_kind_id': 7, 'title_production_year': 2011}


Eliminating: title_mul_cast_info_movie_id: 100%|██████████| 10/10 [00:00<00:00, 138.43it/s]
Finding Elimination Order: : 100%|██████████| 11/11 [00:00<00:00, 3512.28it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 11/11 [00:00<00:00, 114.81it/s]
Finding Elimination Order: : 100%|██████████| 9/9 [00:00<00:00, 3368.62it/s]
Eliminating: title_mul_cast_info_movie_id: 100%|██████████| 9/9 [00:00<00:00, 245.61it/s]
Finding Elimination Order: : 100%|██████████| 10/10 [00:00<00:00, 3356.79it/s]
Eliminating: title_mul_cast_info_movie_id_nn:   0%|          | 0/10 [00:00<?, ?it/s]    

0.000882831796154786
{'title_mul_cast_info_movie_id_nn': 12, 'title_kind_id': 2, 'title_production_year': 2011}


Eliminating: title_mul_cast_info_movie_id: 100%|██████████| 10/10 [00:00<00:00, 137.18it/s]
Finding Elimination Order: : 100%|██████████| 11/11 [00:00<00:00, 3457.53it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 11/11 [00:00<00:00, 113.98it/s]
Finding Elimination Order: : 100%|██████████| 9/9 [00:00<00:00, 3229.70it/s]
Eliminating: title_mul_cast_info_movie_id: 100%|██████████| 9/9 [00:00<00:00, 255.09it/s]
Finding Elimination Order: : 100%|██████████| 10/10 [00:00<00:00, 3342.61it/s]

4.189348575430987e-05
{'title_mul_cast_info_movie_id_nn': 24, 'title_kind_id': 7, 'title_production_year': 2011}



Eliminating: title_mul_cast_info_movie_id: 100%|██████████| 10/10 [00:00<00:00, 135.76it/s]
Finding Elimination Order: : 100%|██████████| 11/11 [00:00<00:00, 3572.11it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 11/11 [00:00<00:00, 113.99it/s]
Finding Elimination Order: : 100%|██████████| 9/9 [00:00<00:00, 3161.80it/s]
Eliminating: title_mul_cast_info_movie_id: 100%|██████████| 9/9 [00:00<00:00, 248.49it/s]
Finding Elimination Order: : 100%|██████████| 10/10 [00:00<00:00, 3306.77it/s]
Eliminating: title_mul_movie_keyword_movie_id:   0%|          | 0/10 [00:00<?, ?it/s]   

0.0004838784973790043
{'title_mul_cast_info_movie_id_nn': 24, 'title_kind_id': 2, 'title_production_year': 2011}


Eliminating: title_mul_cast_info_movie_id: 100%|██████████| 10/10 [00:00<00:00, 140.35it/s]
Finding Elimination Order: : 100%|██████████| 11/11 [00:00<00:00, 3689.51it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 11/11 [00:00<00:00, 114.96it/s]

1.2339777172819369e-05
10230.0
2.5411431789398193





In [20]:
# NOTE(review): infer_range_query is only defined in a later cell, so this fails on a fresh kernel.
# The keys use dotted column names and the captured output of this cell ends in `KeyError: 12`.
infer_range_query(BN, {'title.mul_movie_info_idx.movie_id_nn': [3,12], 'title.kind_id': [7,2], 'title.production_year': [2011.0, 2006.0]}, BN.infer_machine)

Finding Elimination Order: : 100%|██████████| 9/9 [00:00<00:00, 5242.88it/s]
Eliminating: title.mul_movie_info.movie_id: 100%|██████████| 9/9 [00:00<00:00, 295.50it/s]
Finding Elimination Order: : 100%|██████████| 10/10 [00:00<00:00, 5338.30it/s]
Eliminating: title.mul_movie_info.movie_id: 100%|██████████| 10/10 [00:00<00:00, 282.01it/s]

{'title.mul_movie_info_idx.movie_id_nn': 3, 'title.kind_id': 7, 'title.production_year': 2011.0}
title.kind_id 0
{'title.mul_movie_info_idx.movie_id_nn': 1, 'title.production_year': 1}
title.production_year 1
{'title.mul_movie_info_idx.movie_id_nn': 1}
title.mul_movie_info_idx.movie_id_nn 1
{}



Finding Elimination Order: : 100%|██████████| 11/11 [00:00<00:00, 5702.30it/s]
Eliminating: title.mul_movie_companies.movie_id: 100%|██████████| 11/11 [00:00<00:00, 190.15it/s]
Finding Elimination Order: : 100%|██████████| 9/9 [00:00<00:00, 5565.20it/s]
Eliminating: title.mul_movie_info.movie_id: 100%|██████████| 9/9 [00:00<00:00, 303.37it/s]
Finding Elimination Order: : 100%|██████████| 10/10 [00:00<00:00, 5567.91it/s]
Eliminating: title.mul_movie_info.movie_id: 100%|██████████| 10/10 [00:00<00:00, 288.19it/s]


{'title.mul_movie_info_idx.movie_id_nn': 3, 'title.kind_id': 7, 'title.production_year': 2006.0}
title.kind_id 0
{'title.mul_movie_info_idx.movie_id_nn': 1, 'title.production_year': 6}
title.production_year 6
{'title.mul_movie_info_idx.movie_id_nn': 1}
title.mul_movie_info_idx.movie_id_nn 1
{}


Finding Elimination Order: : 100%|██████████| 11/11 [00:00<00:00, 6005.90it/s]
Eliminating: title.mul_movie_companies.movie_id: 100%|██████████| 11/11 [00:00<00:00, 190.05it/s]
Finding Elimination Order: : 100%|██████████| 9/9 [00:00<00:00, 5560.28it/s]
Eliminating: title.mul_movie_info.movie_id: 100%|██████████| 9/9 [00:00<00:00, 294.35it/s]
Finding Elimination Order: : 100%|██████████| 10/10 [00:00<00:00, 5794.84it/s]
Eliminating: title.mul_movie_info.movie_id: 100%|██████████| 10/10 [00:00<00:00, 293.40it/s]


{'title.mul_movie_info_idx.movie_id_nn': 3, 'title.kind_id': 2, 'title.production_year': 2011.0}
title.kind_id 4
{'title.mul_movie_info_idx.movie_id_nn': 1, 'title.production_year': 1}
title.production_year 1
{'title.mul_movie_info_idx.movie_id_nn': 1}
title.mul_movie_info_idx.movie_id_nn 1
{}


Finding Elimination Order: : 100%|██████████| 11/11 [00:00<00:00, 5841.65it/s]
Eliminating: title.mul_movie_companies.movie_id: 100%|██████████| 11/11 [00:00<00:00, 186.21it/s]
Finding Elimination Order: : 100%|██████████| 9/9 [00:00<00:00, 4167.45it/s]
Eliminating: title.mul_movie_info.movie_id: 100%|██████████| 9/9 [00:00<00:00, 300.04it/s]
Finding Elimination Order: : 100%|██████████| 10/10 [00:00<00:00, 5626.16it/s]
Eliminating: title.mul_movie_info.movie_id: 100%|██████████| 10/10 [00:00<00:00, 299.43it/s]


{'title.mul_movie_info_idx.movie_id_nn': 3, 'title.kind_id': 2, 'title.production_year': 2006.0}
title.kind_id 4
{'title.mul_movie_info_idx.movie_id_nn': 1, 'title.production_year': 6}
title.production_year 6
{'title.mul_movie_info_idx.movie_id_nn': 1}
title.mul_movie_info_idx.movie_id_nn 1
{}


Finding Elimination Order: : 100%|██████████| 11/11 [00:00<00:00, 6081.91it/s]
Eliminating: title.mul_movie_companies.movie_id: 100%|██████████| 11/11 [00:00<00:00, 190.53it/s]


{'title.mul_movie_info_idx.movie_id_nn': 12, 'title.kind_id': 7, 'title.production_year': 2011.0}
title.kind_id 0
{'title.mul_movie_info_idx.movie_id_nn': 12, 'title.production_year': 1}


KeyError: 12

In [None]:
# Stand-alone variable-elimination engine on the fitted pgmpy model
# (the same engine Pgmpy_BN uses for infer_algo == "exact").
from pgmpy.inference import VariableElimination
inference = VariableElimination(BN.model)

In [None]:
# Time one raw pgmpy query.
# NOTE(review): the cont_attr* names belong to the toy tables, while the visible BN is built on the
# title table; the evidence values are lists — confirm pgmpy accepts that form.
tic = time.time()
print(inference.query(['cont_attr1'], evidence={'cont_attr3': [0,1], 'cont_attr4': [0,1]}))
print(time.time()-tic)

In [37]:
def temp(BN, query, infer_machine, return_prob=False):
    """Estimate a point query on `BN` with an explicitly supplied inference engine.

    The joint probability is factorised with the chain rule in topological order of
    BN.structure; each factor comes from infer_machine.query(...).values.

    ::Param:: BN: object exposing nrows, structure, node_names, apply_encoding_to_value
                  and apply_ndistinct_to_value
              query: dictionary {attribute: single raw value}; it is not modified
              infer_machine: object with a query(variables, evidence=...) method
              return_prob: if true, return (probability, nrows)
                           else return round(probability * nrows)
    A value that apply_encoding_to_value maps to None yields an estimate of 0.
    Raises ValueError when BN.structure cannot be ordered topologically.
    """
    nrows = BN.nrows
    n_distinct = dict()
    encoded_query = dict()
    for attr, value in query.items():
        encode_value = BN.apply_encoding_to_value(value, attr)
        if encode_value is None:
            # Same convention as Pgmpy_BN.infer_point_query: unknown value -> empty result.
            return (0, nrows) if return_prob else 0
        n_distinct[attr] = BN.apply_ndistinct_to_value(encode_value, value, attr)
        encoded_query[attr] = encode_value

    # Order the nodes so that every node comes after all of its parents.
    ordered = []
    placed = set()
    while len(ordered) < len(BN.structure):
        n_before = len(ordered)
        for i, deps in enumerate(BN.structure):
            if i in placed:
                continue  # already ordered
            if all(d in placed for d in deps):
                ordered.append(i)
                placed.add(i)
        if len(ordered) == n_before:
            # No progress in a full pass would otherwise loop forever.
            raise ValueError("BN.structure contains a cycle or an invalid parent index")
    sampling_order = [BN.node_names[i] for i in ordered]

    p_estimate = 1
    for attr in sampling_order:
        if attr in encoded_query:
            val = encoded_query.pop(attr)
            print(attr, val)
            print(encoded_query)
            # The attributes still left in the query act as evidence for this factor.
            probs = infer_machine.query([attr], evidence=encoded_query).values
            if np.isnan(probs).any():
                p_estimate = 0
                break
            p_estimate *= probs[val] / np.sum(probs) * n_distinct[attr]

    if return_prob:
        return (p_estimate, nrows)
    return round(p_estimate * nrows)

def infer_range_query(BN, query, infer_machine, num_samples=1, return_prob=False):
    """Estimate a set query by summing point-query probabilities over all value combinations.

    ::Param:: BN: the fitted network wrapper (must expose nrows)
              query: dictionary {attribute: candidate values}; the combinations are produced
                     by cartesian_product(query)
              infer_machine: inference engine forwarded to temp()
              num_samples: accepted for interface compatibility; not used by this function
              return_prob: if true, return (probability, BN.nrows)
                           else return round(probability * BN.nrows)
    """
    p_estimate = 0
    for query_tuple in cartesian_product(query):
        # Pair each attribute with its value in this combination, in the dict's key order.
        point_query = {attr: query_tuple[i] for i, attr in enumerate(query)}
        print(point_query)
        p_estimate += temp(BN, point_query, infer_machine, return_prob=True)[0]

    if return_prob:
        return (p_estimate, BN.nrows)
    return round(p_estimate*BN.nrows)

In [None]:
# Time a range query over toy-table attributes.
# NOTE(review): `infer_machine` is only created in a later cell, so this fails on a fresh kernel.
tic = time.time()
print(infer_range_query(BN,{'cont_attr1': [2,3], 'cont_attr2': [2,3], 'cont_attr3': [5,6,7,8]}, infer_machine))
print(time.time()-tic)

In [43]:
# Stand-alone belief-propagation engine on the fitted model, calibrated once before any query
# (the same setup Pgmpy_BN uses for infer_algo == "BP").
from pgmpy.inference import BeliefPropagation
infer_machine = BeliefPropagation(BN.model)
infer_machine.calibrate()

In [44]:
# Same query as the pandas ground-truth comparison cell, answered with the belief-propagation engine.
infer_range_query(BN, {'title_mul_cast_info_movie_id_nn' : [3,12,24], 'title_kind_id': [7,2], 'title_production_year': 2011}, infer_machine)

Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 2/2 [00:00<00:00, 342.95it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 3/3 [00:00<00:00, 236.26it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 1/1 [00:00<00:00, 262.85it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 2/2 [00:00<00:00, 301.94it/s]

{'title_mul_cast_info_movie_id_nn': 3, 'title_kind_id': 7, 'title_production_year': 2011}
title_kind_id 0
{'title_mul_cast_info_movie_id_nn': 3, 'title_production_year': 1}
title_mul_cast_info_movie_id_nn 3
{'title_production_year': 1}
title_production_year 1
{}
{'title_mul_cast_info_movie_id_nn': 3, 'title_kind_id': 2, 'title_production_year': 2011}
title_kind_id 4
{'title_mul_cast_info_movie_id_nn': 3, 'title_production_year': 1}
title_mul_cast_info_movie_id_nn 3
{'title_production_year': 1}



Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 3/3 [00:00<00:00, 235.75it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 1/1 [00:00<00:00, 249.72it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 2/2 [00:00<00:00, 327.65it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 3/3 [00:00<00:00, 245.44it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 1/1 [00:00<00:00, 263.44it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 2/2 [00:00<00:00, 297.02it/s]


title_production_year 1
{}
{'title_mul_cast_info_movie_id_nn': 12, 'title_kind_id': 7, 'title_production_year': 2011}
title_kind_id 0
{'title_mul_cast_info_movie_id_nn': 11, 'title_production_year': 1}
title_mul_cast_info_movie_id_nn 11
{'title_production_year': 1}
title_production_year 1
{}
{'title_mul_cast_info_movie_id_nn': 12, 'title_kind_id': 2, 'title_production_year': 2011}
title_kind_id 4
{'title_mul_cast_info_movie_id_nn': 11, 'title_production_year': 1}
title_mul_cast_info_movie_id_nn 11
{'title_production_year': 1}


Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 3/3 [00:00<00:00, 247.09it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 1/1 [00:00<00:00, 261.10it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 2/2 [00:00<00:00, 321.16it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 3/3 [00:00<00:00, 239.28it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 1/1 [00:00<00:00, 261.15it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 2/2 [00:00<00:00, 328.97it/s]


title_production_year 1
{}
{'title_mul_cast_info_movie_id_nn': 24, 'title_kind_id': 7, 'title_production_year': 2011}
title_kind_id 0
{'title_mul_cast_info_movie_id_nn': 23, 'title_production_year': 1}
title_mul_cast_info_movie_id_nn 23
{'title_production_year': 1}
title_production_year 1
{}
{'title_mul_cast_info_movie_id_nn': 24, 'title_kind_id': 2, 'title_production_year': 2011}
title_kind_id 4
{'title_mul_cast_info_movie_id_nn': 23, 'title_production_year': 1}
title_mul_cast_info_movie_id_nn 23
{'title_production_year': 1}


Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 3/3 [00:00<00:00, 243.43it/s]
Eliminating: title_mul_movie_info_movie_id: 100%|██████████| 1/1 [00:00<00:00, 262.52it/s]

title_production_year 1
{}





10757.0

In [None]:
# NOTE(review): `a` is not defined in any visible cell — leftover debugging, fails on a fresh kernel.
print(a)

In [None]:
# Element-wise NaN mask of a.values. NOTE(review): `a` is not defined in any visible cell.
np.isnan(a.values)

In [None]:
# True when a.values contains a NaN. NOTE(review): `a` is not defined in any visible cell.
any(np.isnan(a.values))

In [2]:
# numpy is already imported in the first cell; this repeat import is redundant.
import numpy as np
# Draw 5 integers from range(1000); no seed is set, so the values change on every run.
idx = np.random.choice(1000, size=5)

In [3]:
# Display the sampled indices.
idx

array([642, 238,  45, 253, 234])

In [15]:
# Sorted distinct cont_attr1 values at the sampled row positions.
# NOTE(review): `toy_df` is not defined in any visible cell (only toy_df1 / toy_df2 are).
sorted(list(set(toy_df["cont_attr1"].iloc[idx])))

[-23.123648636438254,
 -8.761486032223727,
 -6.386568342901904,
 8.525174615562019,
 20.68819772963036]

In [12]:
# Scratch check: string form of a list.
str([1,23])

'[1, 23]'

In [1]:
# Scratch check: type of a parenthesised pair.
type((1,3))

tuple

In [None]:
# NOTE(review): identical to the earlier cell that already creates toy_df1 / toy_df2.
toy_df1 = toy_data_highly_correlated_cont(nrows=10000, return_df=True)
toy_df2 = toy_data_highly_correlated_cont(nrows=40000, return_df=True)

In [7]:
# Index-based join of the two toy tables on 'id'; clashing column names from toy_df2 get '_other'.
new_df = toy_df1.set_index('id').join(toy_df2.set_index('id'), rsuffix='_other')

In [12]:
# Outer merge on 'id'. NOTE(review): this overwrites the `new_df` produced by the join cell.
new_df = pd.merge(toy_df1, toy_df2, on='id', how='outer')

In [13]:
# Row count of the merged table.
len(new_df)

40000

In [14]:
def toy_df_for_merge(nrows=10000, id_name='id', id_num=10000, seed=None):
    """
    Create a highly correlated toy table with a join key, for evaluation and debug purposes.

    ::Param:: nrows: number of rows to generate
              id_name: name of the key column
              id_num: keys are drawn uniformly from range(id_num), so they may repeat
              seed: optional seed for a private random generator; when None the global
                    numpy random state is used
    ::Return:: pandas.DataFrame with the key column followed by cont_attr1 .. cont_attr8,
               all integer valued
    """
    # RandomState offers the same randint/choice API as the module-level functions.
    rng = np.random if seed is None else np.random.RandomState(seed)

    attr1 = rng.randint(10, size=nrows)
    attr2 = attr1 + rng.choice([0, 1], size=nrows, p=[0.9, 0.1])
    attr3 = attr1 + attr2
    attr4 = attr3 + rng.choice([0, 1, 2], size=nrows, p=[0.8, 0.1, 0.1])
    attr5 = attr3 + attr4
    attr6 = attr2 * 2 + rng.randint(10, size=nrows)
    attr7 = attr1 + 10
    attr8 = attr1 + attr4 + attr5 + attr7
    keys = rng.randint(id_num, size=nrows)

    # Every column is already an integer array, so no numeric conversion pass is needed.
    return pd.DataFrame({id_name: keys, 'cont_attr1': attr1, 'cont_attr2': attr2, 'cont_attr3': attr3,
                         'cont_attr4': attr4,
                         'cont_attr5': attr5, 'cont_attr6': attr6, 'cont_attr7': attr7, 'cont_attr8': attr8})

In [19]:
# Two integer toy tables whose key columns are named 'id3' and 'id4'.
toy_df3 = toy_df_for_merge(10000, 'id3')
toy_df4 = toy_df_for_merge(40000, 'id4')

In [21]:
# Outer merge of toy_df1 and toy_df3, matching toy_df1.id against toy_df3.id3.
new_df2 = pd.merge(toy_df1, toy_df3, left_on='id', right_on='id3', how='outer')
print(len(new_df2))
print(new_df2.head(20))

13716
    id  cont_attr1_x  cont_attr2_x  cont_attr3_x  cont_attr4_x  cont_attr5_x  \
0    0     35.281047     -4.042341     31.238706     93.202617     -5.105851   
1    1      8.003144    -16.664620     -8.661476     25.007860    -36.661550   
2    1      8.003144    -16.664620     -8.661476     25.007860    -36.661550   
3    2     19.574760     34.672005     54.246765     53.936899     91.680012   
4    3     44.817864      3.812980     48.630844    117.044660     14.532450   
5    4     37.351160     -3.556208     33.794952     98.377900     -3.890520   
6    5    -19.545558    -21.700502    -41.246060    -43.863894    -49.251256   
7    5    -19.545558    -21.700502    -41.246060    -43.863894    -49.251256   
8    5    -19.545558    -21.700502    -41.246060    -43.863894    -49.251256   
9    6     19.001768     19.445247     38.447015     52.504421     53.613117   
10   7     -3.027144     28.629434     25.602290     -2.567860     76.573584   
11   8     -2.064377     -4.901461

In [17]:
# NOTE(review): the visible cell that builds toy_df4 names its key column 'id4', so on='id' would
# not find the key; the recorded output presumably comes from an earlier toy_df4 — confirm.
new_df3 = pd.merge(toy_df2, toy_df4, on='id', how='outer')
print(len(new_df3))

70173


In [18]:
# Outer merge of the two previously merged tables on 'id'.
new_df4 = pd.merge(new_df2, new_df3, on='id', how='outer')
print(len(new_df4))

85263
