In [2]:
import pomegranate
import torch
import numpy as np
import copy
import random
import json
import pandas as pd
import collections
import time
import math
import itertools

In [4]:
import sys
sys.path.append('/Users/ziniuwu/Desktop/research/BayesNet')
from  import tools

['/Users/ziniuwu/Desktop/research/BayesNet/Testing', '/Users/ziniuwu/anaconda3/envs/BayesNet/lib/python37.zip', '/Users/ziniuwu/anaconda3/envs/BayesNet/lib/python3.7', '/Users/ziniuwu/anaconda3/envs/BayesNet/lib/python3.7/lib-dynload', '', '/Users/ziniuwu/anaconda3/envs/BayesNet/lib/python3.7/site-packages', '/Users/ziniuwu/anaconda3/envs/BayesNet/lib/python3.7/site-packages/IPython/extensions', '/Users/ziniuwu/.ipython']


In [2]:
class Single_BN():
    """
    Build a single Bayesian Network for a single table.
    Initialize with an appropriate table_name.

    Attributes filled in by build_discrete_table() / build_from_data():
        n_in_bin, encoding, mapping: per-column outputs of tools.discretize_series
        max_value: per-column (largest encoded value + 1)
        node_names: column names of the discretized table, in model order
        nrows: number of rows of the dataset handed to build_from_data()
        model: the fitted pomegranate.BayesianNetwork
    """

    def __init__(self, table_name, method='Pome'):
        self.table_name = table_name
        self.n_in_bin = dict()
        self.encoding = dict()
        self.mapping = dict()
        self.max_value = dict()
        self.model = None
        self.method = method
        # Set by build_from_data(); defined up front so __str__ and attribute
        # access do not raise AttributeError on a freshly constructed object.
        self.node_names = None
        self.nrows = None
        self.algorithm = None
        self.max_parents = None
        self.n_mcv = None
        self.n_bins = None
        self.root = None

    def build_discrete_table(self, data, n_mcv, n_bins, drop_na=True, ignore_cols=()):
        """
        Discretize the entire table use bining (This is using histogram method for continuous data)
        ::Param:: data: original table (pandas.DataFrame); it is copied, not modified
                  n_mcv: for categorical data we keep the top n most common values and bin the rest
                  n_bins: number of bins for histogram, larger n_bins will provide more accuracy but less efficiency
                  drop_na: if True, we drop all rows with nan in it
                  ignore_cols: drop the unnecessary columns, for example an id attribute
        ::Return:: the discretized copy of the table (ignored columns removed)
        Side effects: fills self.n_in_bin, self.encoding, self.mapping, self.max_value for
        every kept column and sets self.node_names to the kept column names.
        """
        table = data.copy()
        if drop_na:
            table = table.dropna()
        table = table.drop(columns=[col for col in table.columns if col in ignore_cols])
        for col in table.columns:
            # NOTE(review): discretize_series receives the *negation* of drop_na; its
            # semantics are defined in tools and are not visible here -- confirm.
            table[col], self.n_in_bin[col], self.encoding[col], self.mapping[col] = tools.discretize_series(
                table[col],
                n_mcv=n_mcv,
                n_bins=n_bins,
                drop_na=not drop_na
            )
            self.max_value[col] = int(table[col].max()) + 1
        self.node_names = list(table.columns)
        return table

    def apply_encoding_to_value(self, value, col):
        """Return the encoded value for a real value of column `col`.

        The value is returned unchanged when the column has no encoding or the
        value is not a key of that column's encoding.
        """
        if col not in self.encoding:
            return value
        col_encoding = self.encoding[col]
        if value not in col_encoding:
            return value
        return col_encoding[value]

    def apply_ndistinct_to_value(self, enc_value, value, col):
        """Return the factor by which the probability of bin `enc_value` is scaled for `value`.

        - column or bin unknown: 1
        - bin entry is an integer count: 1 / count
        - bin entry is a container keyed by real value: the entry for `value`, or 1 if absent
        """
        if col not in self.n_in_bin:
            return 1
        bins = self.n_in_bin[col]
        if enc_value not in bins:
            return 1
        entry = bins[enc_value]
        # NumPy integers are accepted too; previously they fell through to the
        # `in` test below and raised TypeError.
        if isinstance(entry, (int, np.integer)):
            return 1 / entry
        if value not in entry:
            return 1
        return entry[value]

    def build_from_data(self, dataset, n_mcv=30, n_bins=60, ignore_cols=('id',), algorithm="greedy",
                        drop_na=True, max_parents=-1, root=None, n_jobs=8):
        """ Build the Pomegranate model from data, including structure learning and parameter learning
            ::Param:: dataset: pandas.dataframe
                      n_mcv: for categorical data we keep the top n most common values and bin the rest
                      n_bins: number of bins for histogram, larger n_bins will provide more accuracy but less efficiency
                      ignore_cols: columns removed before learning
                      n_jobs: forwarded to pomegranate (previously ignored in favour of a hard-coded 8)
            for other parameters, pomegranate gives a detailed explanation:
            https://pomegranate.readthedocs.io/en/latest/BayesianNetwork.html
        """
        # Row count of the dataset as passed in, i.e. before any NaN rows are dropped.
        self.nrows = len(dataset)
        self.algorithm = algorithm
        self.max_parents = max_parents
        self.n_mcv = n_mcv
        self.n_bins = n_bins
        self.root = root

        discrete_table = self.build_discrete_table(dataset, n_mcv, n_bins, drop_na, ignore_cols)
        print(f'building pomegranate.BayesianNetwork from data with {self.nrows} rows')
        t = time.time()
        self.model = pomegranate.BayesianNetwork.from_samples(discrete_table,
                                                  algorithm=algorithm,
                                                  state_names=self.node_names,
                                                  max_parents=max_parents,
                                                  n_jobs=n_jobs,
                                                  root=self.root)
        print(f'Took {time.time() - t} secs.')

    def __str__(self):
        return f"bn{self.table_name}.{self.algorithm}-{self.max_parents}-{self.root}-{self.n_mcv}-{self.n_bins}"

    def load(self, path, pgm_path=None):
        """Load a model previously written by save() from the JSON file at `path`.

        Only self.model is restored; node_names, nrows and the encoding tables
        are not part of the file and keep their current values.
        `pgm_path` is accepted but unused.
        """
        with open(path, 'r') as myfile:
            json_model = myfile.read()
        # Only the `pomegranate` module is imported, so the class must be qualified.
        self.model = pomegranate.BayesianNetwork.from_json(json_model)

    def save(self, path, pgm_path=None):
        """Write self.model as JSON to `path`. `pgm_path` is accepted but unused."""
        with open(path, 'w') as outfile:
            outfile.write(self.model.to_json())

    def _chain_rule_probability(self, evidence, n_distinct, order):
        """Multiply P(X_i = v_i | evidence still set) over `order`, clearing each X_i first.

        `evidence` is modified in place (visited entries become None).
        """
        p_estimate = 1
        for i in order:
            val = evidence[i]
            if val is None:
                continue
            # Remove the variable from the evidence, then read its predicted distribution.
            evidence[i] = None
            dist = self.model.predict_proba(evidence)
            p_estimate *= dist[i].parameters[0][val] * n_distinct[i]
        return p_estimate

    def _topological_order(self):
        """Return node indices so that every node comes after all entries of model.structure[node]."""
        structure = self.model.structure
        ordered = []
        placed = set()
        while len(ordered) < len(structure):
            progressed = False
            for i, deps in enumerate(structure):
                if i in placed:
                    continue  # already ordered
                if all(d in placed for d in deps):
                    ordered.append(i)
                    placed.add(i)
                    progressed = True
            if not progressed:
                # Without this guard a cyclic structure would loop forever.
                raise ValueError("model structure contains a cycle; no topological order exists")
        return ordered

    def loopy_belief_propagation(self, evidence, n_distinct):
        """Estimate the joint probability of `evidence`, visiting the variables in random order.

        Each observed variable is removed from the evidence in turn and its
        probability given the remaining evidence is read from
        self.model.predict_proba; the factors are multiplied together.
        The caller's `evidence` list is left untouched.
        """
        remaining = list(evidence)
        order = [i for i, val in enumerate(remaining) if val is not None]
        random.shuffle(order)
        return self._chain_rule_probability(remaining, n_distinct, order)

    def infer_point_query_LBP(self, query, num_samples=1, return_prob=False):
        """Probability inference using Loopy belief propagation. For example estimate P(X=x, Y=y, Z=z)
           ::Param:: query: dictionary of the form {X:x, Y:y, Z:z}
                     x,y,z can only be a single value
                     num_samples: how many times to run inference. With 1 the variables are
                     visited in topological order; with more, each run uses a random order
                     and the results are averaged.
                     return_prob: if true, return the tuple (P(X=x, Y=y, Z=z), nrows)
                                  else return int(P(X=x, Y=y, Z=z)*nrows)
           ::Raises:: ValueError if num_samples < 1 or a query attribute is not a model column
        """
        if num_samples < 1:
            raise ValueError("num_samples must be at least 1")
        nrows = self.nrows

        evidence = [None] * len(self.node_names)
        n_distinct = [1] * len(self.node_names)
        for attr in query:
            ind = self.node_names.index(attr)
            evidence[ind] = self.apply_encoding_to_value(query[attr], attr)
            n_distinct[ind] = self.apply_ndistinct_to_value(evidence[ind], query[attr], attr)

        if num_samples == 1:
            # Using topological order to infer probability
            p_estimate = self._chain_rule_probability(evidence, n_distinct, self._topological_order())
        else:
            p_estimates = [self.loopy_belief_propagation(evidence, n_distinct)
                           for _ in range(num_samples)]
            p_estimate = sum(p_estimates) / num_samples

        if return_prob:
            return (p_estimate, nrows)
        return int(p_estimate * nrows)

    def infer_range_query_LBP(self, query, num_samples=1, return_prob=False):
        """Probability inference for a query whose attributes may each take a set of values.
           ::Param:: query: dictionary of the form {X:[x1, x2], Y:y, Z:[z]}
                     a list is read as "any of these values"; a non-list is a single value.
                     Repeated values inside a list are counted once.
                     num_samples: forwarded to infer_point_query_LBP for every point query
                     return_prob: if true, return the tuple (probability, nrows)
                                  else return probability*nrows (not truncated to int)
           Every combination of values is evaluated as a separate point query, so
           LBP for estimating range query can be really slow
        """
        attrs = list(query)
        candidates = []
        for attr in attrs:
            val = query[attr]
            if not isinstance(val, list):
                val = [val]
            # Drop duplicates (order kept) so a repeated value is not counted twice.
            candidates.append(list(dict.fromkeys(val)))

        p_estimate = 0
        for query_tuple in itertools.product(*candidates):
            point_query = dict(zip(attrs, query_tuple))
            p_estimate += self.infer_point_query_LBP(point_query, num_samples=num_samples,
                                                     return_prob=True)[0]

        if return_prob:
            return (p_estimate, self.nrows)
        return p_estimate * self.nrows
            

In [3]:
def read_table_csv(n = 20, filename="/Users/ziniuwu/Desktop/research/imdb/title.csv", seed=None):
    """
    Read a subsample of the IMDB title csv and append three synthetic random columns.

    n: keep only file lines whose 0-based index is a multiple of n. Line 0 is
       consumed as the header row, so roughly 1/n of the data rows are kept
       (n = 100 keeps about 1% of the lines).
    filename: path of the csv file; defaults to the original hard-coded location.
    seed: optional seed for the random columns. With None the global numpy
          random state is used, exactly as before.

    Returns a DataFrame with columns id, kind_id, production_year, random1,
    random2, random3, each converted to a numeric dtype where possible.
    Raises ValueError if n < 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    all_columns = ['id', 'title', 'imdb_index', 'kind_id', 'production_year', 'imdb_id',
                   'phonetic_code', 'episode_of_id', 'season_nr', 'episode_nr',
                   'series_years', 'md5sum']
    unused_columns = ['episode_of_id', 'title', 'imdb_index', 'phonetic_code', 'season_nr',
                      'imdb_id', 'episode_nr', 'series_years', 'md5sum']

    def to_numeric_if_possible(series):
        # Same result as pd.to_numeric(errors="ignore"), which is deprecated in
        # recent pandas: hand the column back untouched when it cannot be parsed.
        try:
            return pd.to_numeric(series)
        except (ValueError, TypeError):
            return series

    df = pd.read_csv(filename, header=0, escapechar='\\', encoding='utf-8', quotechar='"',
                     sep=',', skiprows=lambda i: i % n != 0)
    df.columns = all_columns
    df = df.drop(columns=unused_columns)

    rng = np.random if seed is None else np.random.RandomState(seed)
    df['random1'] = rng.randint(10, size=len(df))
    df['random2'] = rng.randint(3, size=len(df))+10
    df['random3'] = rng.normal(3, 100, size=len(df))
    return df.apply(to_numeric_if_possible)

In [4]:
# Load a subsample of the title table: read_table_csv keeps only the file lines
# whose index is a multiple of 200. Then report how many rows were kept.
df = read_table_csv(200)
print(len(df))

12639


In [5]:
# Fit a Bayesian network on the sampled table with build_from_data's default settings.
BN = Single_BN('title')
BN.build_from_data(df)

building pomegranate.BayesianNetwork from data with 12639 rows
Took 0.27854204177856445 secs.


In [None]:
def debug(evidence, BN):
    """Print and multiply the no-evidence probabilities of the given (column, value) pairs.

    evidence: iterable of (column name, real value) pairs; every value must be a
              key of BN.encoding[column].
    BN: a fitted Single_BN.

    The model is queried once with every variable unobserved, so the returned
    product treats the columns as if they were independent. Each individual
    probability is printed as it is read.
    """
    # One None per model column instead of a fixed list of five.
    pred = BN.model.predict_proba([None] * len(BN.node_names))
    p = 1
    for (col, v) in evidence:
        i = BN.node_names.index(col)
        marginal = pred[i].parameters[0][BN.encoding[col][v]]
        print(marginal)
        p *= marginal
    return p

In [None]:
# Actual number of sampled rows with production_year in {2011, 2012}, random1 == 2
# and random2 in {10, 11} (in DataFrame.query, `==` against a list is a membership test).
len(df.query('production_year in [2011,2012]').query('random1 == 2').query('random2 == [10,11]'))

In [None]:
# Model estimate of the row count for production_year in {2011, 2012}, random1 == 2,
# random2 in {10, 11}; list values are expanded into one point query per combination.
print(BN.infer_range_query_LBP({'production_year': [2011,2012], 'random1':2, 'random2':[10,11]}, num_samples=1))

In [None]:
# Inspect the fitted model's structure; infer_point_query_LBP reads entry i as the
# indices node i depends on.
BN.model.structure

In [None]:
import itertools

# Scratch check of itertools.product: print every tuple that takes one item
# from each inner list, in the order the lists are given.
somelists = [[1, 2, 3], ['a', 'b'], [4, 5]]
for element in itertools.product(*somelists):
    print(element)

In [11]:
# np.Inf was removed in NumPy 2.0; np.inf is the spelling that works on every version.
-np.inf > 0

False

In [10]:
# np.Inf was removed in NumPy 2.0; np.inf is the spelling that works on every version.
np.inf > 0

True