In [37]:
import pomegranate
import torch
import numpy as np
import copy
import json
import pandas as pd
import collections
import time
import tools

In [29]:
class Single_BN():
    """
    Build a single Bayesian Network for a single table.
    Initialize with an appropriate table_name.

    Typical use: call ``build_from_data`` to discretize a DataFrame and learn a
    pomegranate network, then ``infer_point_query`` to estimate how many rows
    satisfy a conjunction of equality predicates.
    """

    def __init__(self, table_name):
        self.table_name = table_name
        # Per-column results of tools.discretize_series, keyed by column name.
        self.n_in_bin = dict()
        self.encoding = dict()
        self.mapping = dict()
        # Largest discretized value observed in each column.
        self.max_value = dict()
        self.model = None
        # Populated by build_discrete_table / build_from_data. Initialised here so
        # that __str__ also works on an instance that has not been fitted yet.
        self.node_names = []
        self.nrows = None
        self.algorithm = None
        self.max_parents = None
        self.n_mcv = None
        self.n_bins = None
        self.root = None

    def build_discrete_table(self, table, n_mcv, n_bins, ignore_cols=()):
        """
        Discretize the entire table using binning (histogram method for continuous data).
        ::Param:: table: original table (pandas.DataFrame); it is not modified
                  n_mcv: for categorical data we keep the top n most common values and bin the rest
                  n_bins: number of bins for histogram, larger n_bins will provide more accuracy but less efficiency
                  ignore_cols: columns to drop before discretizing, for example an id attribute
        ::Return:: a new DataFrame holding the discretized columns.
        Side effects: refreshes n_in_bin, encoding, mapping and max_value for every
        kept column and sets node_names to the kept column order.
        """
        ignored = () if ignore_cols is None else ignore_cols
        kept = [col for col in table.columns if col not in ignored]
        # Work on a copy restricted to the kept columns so the caller's frame is untouched.
        table = table[kept].copy()
        for col in kept:
            table[col], self.n_in_bin[col], self.encoding[col], self.mapping[col] = tools.discretize_series(
                table[col],
                n_mcv=n_mcv,
                n_bins=n_bins
            )
            self.max_value[col] = table[col].max()
        self.node_names = list(table.columns)
        return table

    def apply_encoding_to_value(self, value, col):
        """Return the encoded value for a raw value of `col`.

        The raw value is returned unchanged when the column has no encoding or
        the value is not part of that encoding.
        """
        if col not in self.encoding:
            return value
        col_encoding = self.encoding[col]
        if value not in col_encoding:
            return value
        return col_encoding[value]

    def apply_ndistinct_to_value(self, enc_value, value, col):
        """Return the factor that scales a bin probability down to a single raw value.

        ::Param:: enc_value: encoded (binned) value
                  value: raw value from the query
                  col: column name
        Returns 1 when nothing is recorded for the column, bin or value. When the
        bin entry is an integer count the factor is 1/count; otherwise the entry
        is indexed by the raw value and that stored factor is returned.
        """
        if col not in self.n_in_bin:
            return 1
        bins = self.n_in_bin[col]
        if enc_value not in bins:
            return 1
        entry = bins[enc_value]
        if isinstance(entry, int):
            return 1 / entry
        if value not in entry:
            return 1
        return entry[value]

    def build_from_data(self, dataset, n_mcv=30, n_bins=60, ignore_cols=('id',), algorithm="greedy", max_parents=-1, root=None, n_jobs=8):
        """ Build the Pomegranate model from data, including structure learning and parameter learning
            ::Param:: dataset: pandas.dataframe
                      n_mcv: for categorical data we keep the top n most common values and bin the rest
                      n_bins: number of bins for histogram, larger n_bins will provide more accuracy but less efficiency
                      ignore_cols: columns excluded from the model
                      n_jobs: forwarded to pomegranate's structure learning
            for other parameters, pomegranate gives a detailed explanation:
            https://pomegranate.readthedocs.io/en/latest/BayesianNetwork.html
        """
        self.nrows = len(dataset)
        self.algorithm = algorithm
        self.max_parents = max_parents
        self.n_mcv = n_mcv
        self.n_bins = n_bins
        self.root = root

        discrete_table = self.build_discrete_table(dataset, n_mcv, n_bins, ignore_cols)
        print(f'building pomegranate.BayesianNetwork from data with {self.nrows} rows')
        t = time.time()
        self.model = pomegranate.BayesianNetwork.from_samples(discrete_table,
                                                  algorithm=algorithm,
                                                  state_names=self.node_names,
                                                  max_parents=max_parents,
                                                  # honour the caller's setting (was hard-coded to 8)
                                                  n_jobs=n_jobs,
                                                  root=self.root)
        print(f'Took {time.time() - t} secs.')

    def __str__(self):
        return f"bn{self.table_name}.{self.algorithm}-{self.max_parents}-{self.root}-{self.n_mcv}-{self.n_bins}"

    def load(self, path, pgm_path=None):
        """Load a network previously written by `save`.

        Only `self.model` is restored; encodings, node_names and nrows are not
        part of the JSON file. `pgm_path` is accepted but unused.
        """
        with open(path, 'r') as myfile:
            json_model = myfile.read()
        # Only the `pomegranate` module is imported, so the class must be qualified.
        self.model = pomegranate.BayesianNetwork.from_json(json_model)

    def save(self, path, pgm_path=None):
        """Write the network to `path` as JSON. `pgm_path` is accepted but unused."""
        with open(path, 'w') as outfile:
            outfile.write(self.model.to_json())

    def loopy_belief_propagation(self, evidence, n_distinct):
        """Estimate the joint probability of `evidence` with the chain rule.

        ::Param:: evidence: list aligned with node_names; None marks an unobserved variable
                  n_distinct: list aligned with node_names holding per-variable scaling factors
        Each observed variable is released in turn and its probability is read
        from the model's prediction given the evidence that is still set; the
        scaled probabilities are multiplied together.
        The caller's `evidence` list is left untouched.
        """
        remaining = list(evidence)  # work on a copy; entries are cleared as we go
        p_estimates = 1
        for i, val in enumerate(remaining):
            if val is None:
                continue
            remaining[i] = None
            dist = self.model.predict_proba(remaining)
            p_estimates *= dist[i].parameters[0][val] * n_distinct[i]
        return p_estimates

    def infer_point_query(self, query, num_samples=1, return_prob=False):
        """Probability inference using learnt model. For example estimate P(X=x, Y=y, Z=z)
           ::Param:: query: dictionary of the form {X:x, Y:y, Z:z}
                     x,y,z can only be a single value
                     num_samples: how many times to run inference; the results are averaged.
                     Must be at least 1.
                     return_prob: if true, return the tuple (P(X=x, Y=y, Z=z), nrows)
                                  else return int(P(X=x, Y=y, Z=z)*nrows)
           ::Raises:: ValueError if num_samples < 1 or a queried attribute is not
                      one of the model's columns.
        """
        if num_samples < 1:
            raise ValueError("num_samples must be at least 1")
        nrows = self.nrows

        evidence = [None] * len(self.node_names)
        n_distinct = [1] * len(self.node_names)
        for attr in query:
            ind = self.node_names.index(attr)
            evidence[ind] = self.apply_encoding_to_value(query[attr], attr)
            n_distinct[ind] = self.apply_ndistinct_to_value(evidence[ind], query[attr], attr)

        total = 0
        for _ in range(num_samples):
            total += self.loopy_belief_propagation(evidence, n_distinct)
        p_estimates = total / num_samples

        if return_prob:
            return (p_estimates, nrows)
        return int(p_estimates * nrows)
        

In [3]:
def read_table_csv(n = 20, filename="/Users/ziniuwu/Desktop/research/imdb/title.csv", seed=None):
    """
    Read a sample of the IMDB title table and add three synthetic columns.

    n: keep every nth line of the file, i.e. a 1/n sample (n=20 keeps 5% of the lines).
       Line 0 is always kept and consumed as the header.
    filename: path of the CSV file; it must have 12 columns.
    seed: optional seed for the synthetic columns. With None the global numpy
          random state is used, so a prior np.random.seed(...) still applies.

    Returns a DataFrame with columns id, kind_id, production_year, random1
    (integers 0-9), random2 (integers 10-12) and random3 (normal, mean 3, std 100).
    Raises ValueError if n < 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    def _to_numeric_or_keep(column):
        # Same result as the deprecated pd.to_numeric(..., errors="ignore"):
        # convert when possible, otherwise return the column unchanged.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    df = pd.read_csv(filename, header=0, escapechar='\\', encoding='utf-8', quotechar='"',
                     sep=',', skiprows=lambda i: i % n != 0)
    df.columns = ['id', 'title', 'imdb_index', 'kind_id', 'production_year', 'imdb_id',
                  'phonetic_code', 'episode_of_id', 'season_nr', 'episode_nr',
                  'series_years', 'md5sum']
    df = df.drop(columns=['episode_of_id', 'title', 'imdb_index', 'phonetic_code', 'season_nr',
                          'imdb_id', 'episode_nr', 'series_years', 'md5sum'])

    # np.random (module) and RandomState expose the same randint/normal calls.
    rng = np.random if seed is None else np.random.RandomState(seed)
    df['random1'] = rng.randint(10, size=len(df))
    df['random2'] = rng.randint(3, size=len(df)) + 10
    df['random3'] = rng.normal(3, 100, size=len(df))
    return df.apply(_to_numeric_or_keep)

In [30]:
# Load the sampled title table; read_table_csv(20) keeps every 20th line of the file.
df = read_table_csv(20)
print(len(df))
# Preview the first rows of the sample.
df.head(20)

126401


  if (await self.run_code(code, result,  async_=asy)):


Unnamed: 0,id,kind_id,production_year,random1,random2,random3
0,122463,7,2006.0,3,10,129.582521
1,33079,7,2009.0,8,10,243.06462
2,111657,7,2011.0,2,11,-97.417777
3,51440,7,1999.0,3,12,-42.620029
4,67012,7,2007.0,8,10,58.652039
5,110274,7,1970.0,2,10,66.282334
6,91401,7,,2,10,124.914509
7,44624,7,1990.0,9,11,-41.182882
8,194183,2,2003.0,9,10,-59.472201
9,41025,7,2012.0,4,11,101.319601


In [31]:
# Fit a Bayesian network for the 'title' table using build_from_data's default settings.
BN = Single_BN('title')
BN.build_from_data(df)

building pomegranate.BayesianNetwork from data with 126401 rows
Took 2.62218976020813 secs.


In [32]:
def debug(evidence, BN):
    """Print each queried column's unconditional probability and return their product.

    evidence: iterable of (column, raw_value) pairs; every column must be in
              BN.node_names and every raw value in BN.encoding[column].
    BN: a fitted Single_BN.

    The model is queried once with no evidence set, so the returned product
    treats the columns as independent.
    """
    # Size the all-unobserved evidence vector from the model instead of assuming 5 columns.
    pred = BN.model.predict_proba([None] * len(BN.node_names))
    p = 1
    for (col, v) in evidence:
        i = BN.node_names.index(col)
        marginal = pred[i].parameters[0][BN.encoding[col][v]]
        print(marginal)
        p *= marginal
    return p

In [35]:
# Exact number of sampled rows matching all three equality predicates.
len(df.query('production_year == 1999').query('random1 == 2').query('random2 == 10'))

91

In [36]:
# Model's row-count estimate for the query, followed by debug()'s product of
# per-column probabilities (debug also prints each factor).
print(BN.infer_point_query({'production_year': 1999, 'random1':2, 'random2':10}))
print(debug([('production_year', 1999), ('random1', 2), ('random2',10)], BN))

82
0.019390669377615584
0.10040268668760531
0.3339372315092444
0.0006501341485049419


In [None]:
# Re-discretize the sampled table with the settings the model was built with.
# The result gets its own name so `df` keeps the raw data and the cell can be re-run.
discrete_df = BN.build_discrete_table(df, n_mcv=BN.n_mcv, n_bins=BN.n_bins, ignore_cols=['id'])
discrete_df['random1'].nunique()

In [None]:
# Query the network with only position 2 of the evidence list observed (value 5).
# NOTE(review): position 2 is presumably 'random1' given the column order after
# dropping 'id' -- confirm against BN.node_names.
pred = BN.model.predict_proba([None, None, 5, None, None])

In [None]:
# Inspect the first parameter of the prediction entry at position 2.
pred[2].parameters[0]

In [None]:
# Scratch data: a small list of strings.
l = ['as','an','apple']

In [None]:
# Position of 'as' in the list (it is the first element, so 0).
l.index('as')

In [None]:
# Scratch data: a small dict mapping strings to integers.
c = {'as': 1, 'an': 2, 'apple':3}

In [None]:
# Remove the entries of `c` one at a time, printing what is left after each removal.
# `key` was previously undefined; take the first remaining key on every pass.
while len(c) != 0:
    key = next(iter(c))
    c.pop(key)
    print(c)