In [1]:
import logging
import argparse
import time
import sys
import numpy as np
import pandas as pd
import os
sys.path.append('/home/ziniu.wzn/FSPN/fspn')
os.makedirs('logs', exist_ok=True)
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)-5.5s]  %(message)s",
    handlers=[
        logging.FileHandler("logs/{}_{}.log".format("debug", time.strftime("%Y%m%d-%H%M%S"))),
        logging.StreamHandler()
    ])
logger = logging.getLogger(__name__)

In [2]:
from Evaluation.test_training import test_on_toy
from Structure.nodes import Context, bfs, Sum, Product, Factorize, Leaf, get_nodes_by_type, get_topological_order, get_parents
from Structure.StatisticalTypes import MetaType
from Learning.validity import is_valid
from Learning.learningWrapper import learn_FSPN_V2
from Inference.inference import prod_likelihood, sum_likelihood
from Structure.model import FSPN

In [3]:
def merge_leaves(node):
    """
    Convert a product node of only leaf children into a multivariate leaf for faster inference
    :param node: a product node
    :return: a leaf node
    """
    assert isinstance(node, Product), "incorrect parent node"
    scope = []
    for leaf in node.children:
        assert isinstance(leaf, Leaf), f"invalid children node type {type(leaf)}"
        scope.extend(leaf.scope)
    assert set(scope) == set(node.scope), "unmatched scope"
    new_node = Merge_leaves(node.children, node.scope, ranges = node.range)
    return new_node

class FSPN():

    def __init__(self):
        self.model = None
        self.ds_context = None

        # training stats
        self.learn_time = None
        self.rdc_threshold = None
        self.min_instances_slice = None
        self.pre_calculated = None


    def learn_from_data(self, train_data, ds_context, rdc_threshold=0.3, min_instances_slice=1,
                        max_sampling_threshold_cols=50000,
                        max_sampling_threshold_rows=500000, no_compression_scopes=None):

        # build domains (including the dependence analysis)
        learn_start_t = time.perf_counter()
        if min_instances_slice <= 1:
            min_instances_slice = round(len(train_data)*min_instances_slice)

        self.model = learn_FSPN_V2(train_data, ds_context, threshold=rdc_threshold,
                                   rdc_sample_size=max_sampling_threshold_cols)

        assert is_valid(self.model, check_ids=True)
        learn_end_t = time.perf_counter()
        self.learn_time = learn_end_t - learn_start_t
        logging.debug(f"Built SPN in {learn_end_t - learn_start_t} sec")

        # statistics
        self.rdc_threshold = rdc_threshold
        self.min_instances_slice = min_instances_slice

    def store_factorize_as_dict(self):
        """
        1. Store the factorize node in a dictionary like data structure with key = id
        2. Store all ranges of leave node w.r.t. a right branch of factorize node as array.
        """
        self.fact_node = dict()
        for fact_node in get_nodes_by_type(self.model, Factorize):
            self.fact_node[fact_node.id] = fact_node

        self.leaves = dict()
        self.leaves_condition = dict()
        self.leaves_range = dict()
        self.weak_connected_leaves = []
        parents = get_parents(self.model)
        for fact_id in self.fact_node:
            node = self.fact_node[fact_id]
            assert len(node.children) == 2, "invalid fspn"
            right_branch = node.children[1]
            assert right_branch.range is not None, "right branch of a fact node has no range"
            for r_prod in get_nodes_by_type(right_branch, Product):
                #merge the product node in right branch to a single leaf node
                (parent_node, pos) = parents[r_prod][0]
                new_leaf = merge_leaves(r_prod)
                parent_node.children[pos] = new_leaf
                del r_prod
            leave_condition = []
            for key in right_branch.range:
                leave_condition.append(key)
            leave = []
            leave_left_bound = []
            leave_right_bound = []
            for r_leaf in get_nodes_by_type(right_branch, Leaf):
                leave.append(r_leaf)
                assert r_leaf.range is not None, "right branch leaf of a fact node has no range"
                left_bound = []
                right_bound = []
                for attr in r_leaf.range:
                    lrange = r_leaf.range[attr][0]
                    if type(lrange) != tuple:
                        left_bound.append(lrange)
                        right_bound.append(lrange)
                    else:
                        left_bound.append(lrange[0])
                        right_bound.append(lrange[1])
                leave_left_bound.append(left_bound)
                leave_right_bound.append(right_bound)
            self.leaves_condition[fact_id] = leave_condition
            self.leaves[fact_id] = leave
            self.leaves_range[fact_id] = (np.asarray(leave_left_bound), np.asarray(leave_right_bound))

        for leaf in get_nodes_by_type(self.model, Leaf):
            is_weak = True
            for fact_id in self.leaves:
                if leaf in self.leaves[fact_id]:
                    is_weak = False
            if is_weak:
                self.weak_connected_leaves.append(leaf)


    def get_overlap(self, a, b):
        """
        Calculate the overlap of l1 and l2
        :param a: of shape (np.array(n,k), np.array(n,k))
        :param b: of shape (np.array(m,k), np.array(m,k))
        :return: l: of shape (np.array(m,n,k), np.array(m,n,k))
        _ can have multiple dimensions
        """
        al, ar = a
        bl, br = b
        (n, k) = al.shape
        left_res = np.zeros((bl.shape[0], n, k))
        right_res = np.zeros((br.shape[0], n, k))
        for i in range(n):
            left_res[:, i, :] = np.maximum(bl, al[i, :])
            right_res[:, i, :] = np.minimum(br, ar[i, :])
        return left_res.reshape((-1, k)), right_res.reshape((-1, k))


    def _probability_left_most(self, query, node, attr):
        """
            calculate the probability on spn without factorized node
        """
        nodes = get_topological_order(node)

        all_results = {}

        for n in nodes:
            if isinstance(n, Leaf):
                print(f"--------------------------------------------------------{n}")
                print(query)
                print(attr)
                print(n.scope)
                print(n.breaks)
                print(n.cdf)
                print(n.nan_perc)
                result = n.query(query, attr)
                print(result)
                print(f"-------------------------------------------------------------")
            else:
                tmp_children_list = []
                for i in range(len(n.children)):
                    ci = n.children[i]
                    tmp_children_list.append(all_results[ci])
                if isinstance(n, Sum):
                    result = sum_likelihood(n, tmp_children_list)
                elif isinstance(n, Product):
                    print(tmp_children_list)
                    result = prod_likelihood(n, tmp_children_list)
                else:
                    assert not isinstance(n, Factorize), "Factorize node should be eliminated"

            all_results[n] = result

        return all_results[node]

    def _spn_probability(self, query, node, query_attr, calculated):
        """
            Calculate the probability of a branch with all factorized node evaluated
            Node must contain a factorize node as children in this case.
        """
        assert query[0].shape[-1] == len(query_attr)
        if isinstance(node, Leaf):
            print(node, node.query(query, query_attr))
            return node.query(query, query_attr)
        elif isinstance(node, Factorize):
            print(node, calculated[node.id])
            return calculated[node.id]

        child_res = []
        for child in node.children:
            child_res.append(self._spn_probability(query, child, query_attr, calculated))
        if isinstance(node, Sum):
            return sum_likelihood(node, child_res)
        else:
            return prod_likelihood(node, child_res)


    def _leave_prob(self, query, fact_id, attr):
        """
            calculate a batch of range probability on leaves
            query of shape (n,k)
            output shape (n,m) where m is the number of leaves of a fact_node
        """
        leaves = self.leaves[fact_id]
        probs = np.zeros((len(query[0]), len(leaves)))
        for i, leaf in enumerate(leaves):
            probs[:, i] = leaf.query(query, attr)
        return probs


    def probability(self, query, node=None, query_attr=None, calculated=dict()):
        """
        Calculate the probability
        :param query: two numpy arrays of value refers to the lower and upper range of each attribute
        :param node: start the evaluation from this node
        :param query_attr: scope being queried with length k
        :param query_shape: the input query is flattened to n,k from query_shape _,k
        :return: probability of query, of shape n,
        """
        assert len(query) == 2, "incorrect query parser"
        if node is None:
            node = self.model
        if query_attr is None:
            query_attr = node.scope
        print(query_attr)
        scope = node.scope
        condition = [item for item in node.range] if node.range is not None else []
        assert query[0].shape[-1] == len(scope)+len(condition), "query length mismatch"
        assert set(query_attr) == set(scope+condition), "incorrect query_attr"

        #get the first factorize node in this branch
        first = None
        exist_fact = False
        for fact in get_nodes_by_type(node, Factorize):
            exist_fact = True
            if fact.id not in calculated:
                if first is None or fact.id < first.id:
                    first = fact
        if not exist_fact:
            print(node)
            #This node does not have factorize node children
            prob = self._probability_left_most(query, node, query_attr)
            return prob
        elif first is None:
            #We have evaluated all factorize node
            prob = self._spn_probability(query, node, query_attr, calculated)
            return prob
        else:
            prob = self.eval_fact_node(query, first, query_attr, calculated)
            calculated[first.id] = prob
            if node.id == first.id:
                #factorize node is the root node in the current branch
                return prob
            else:
                return self.probability(query, node, query_attr, calculated)


    def eval_fact_node(self, query, node, query_attr, calculated):
        """
        input query shape: n,k
        output probability: n,
        """
        right_branch = node.children[-1]
        scope = right_branch.scope
        condition = [item for item in right_branch.range]
        assert len(set(scope).intersection(set(condition))) == 0, "some scope conditioned on itself"

        scope_idx = [query_attr.index(item) for item in scope]
        condition_idx = [query_attr.index(item) for item in condition]
        scope_query = (query[0][:, scope_idx], query[1][:, scope_idx])
        print(scope_query)
        condition_query = (query[0][:, condition_idx], query[1][:, condition_idx])
        scope_prob = self._leave_prob(scope_query, node.id, scope)
        print(f"scope_prob of {node} is {scope_prob}, with sum {np.sum(scope_prob)}")
        new_query = self.get_overlap(self.leaves_range[node.id], condition_query)
        print(new_query)
        condition_prob = self.probability(new_query, node.children[0], condition, calculated)
        condition_prob = condition_prob.reshape((-1, len(self.leaves[node.id])))
        print(condition_prob.shape, len(self.leaves[node.id]))
        print(f"condition_prob of {node} is {condition_prob} with sum {np.sum(condition_prob)}")
        np.save("scope_prob2", scope_prob)
        np.save("cond_prob2", condition_prob)
        prob = np.sum(np.multiply(condition_prob, scope_prob), axis=1)
        prob[prob>1] = 1
        np.save("fact_prob2", prob)
        print(f"factorize node {node} probability {prob}")
        return prob

In [19]:
from Learning.learningWrapper import learn_FSPN_V2
from Structure.nodes import Context
from Structure.leaves.parametric.Parametric import Categorical, Gaussian
from Evaluation.toy_dataset import *

#data = toy_data_slightly_correlated_cat(nrows=1000000, seed=0)
#ds_context = Context(parametric_types=[Categorical, Categorical, Categorical, Categorical,
                                       #Categorical, Categorical, Categorical, Categorical]).add_domains(data)
data = pd.read_hdf("/home/ziniu.wzn/DMV/DMV_discretize.hdf")
#data = data.to_numpy()
sample_data = data.sample(n = 10000000)
sample_data = sample_data.to_numpy()
ds_context = Context(parametric_types=[Categorical, Categorical, Categorical, Categorical, Categorical, Categorical,
                                       Categorical, Categorical, Categorical, Categorical,]).add_domains(sample_data)
fspn = learn_FSPN_V2(sample_data, ds_context, rdc_sample_size=100000, threshold=0.2)

2020-07-21 16:34:18,385 [DEBUG]  Current task with data (10000000, 10) scope [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] and condition []
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
2020-07-21 16:34:35,007 [DEBUG]  OP: Operation.SPLIT_COLUMNS on slice (10000000, 10) (remaining tasks 0)
2020-07-21 16:34:35,461 [DEBUG]  		found 4 col clusters (in 0.45227 secs)
2020-07-21 16:34:35,462 [DEBUG]  Create an independent component with scope [0, 1, 2, 3, 4, 5, 6] and condition []
2020-07-21 16:34:35,463 [DEBUG]  Create an independent component with scope [7] and condition []
2020-07-21 16:34:35,463 [DEBUG]  Create an independent component with scope [8] and condition []
2020-07-21 16:34:35,464 [DEBUG]  Create an independent component with scope [9] and condition []
2020-07-21 16:34:35,464 [DEBUG]  Current task with data (10000000, 7) scope [0, 1, 2, 3, 4, 5, 6] and condition []
2020-07-21 16:34:56,946 [DEBUG]  OP: Operation.FACTORIZE on slice (10000000, 7) (remaining tasks 3)
2020-0

2020-07-21 16:38:36,781 [DEBUG]  OP: Operation.SPLIT_COLUMNS on slice (5472906, 2) (remaining tasks 7)
2020-07-21 16:38:37,316 [DEBUG]  		found 2 col clusters (in 0.53407 secs)
2020-07-21 16:38:37,317 [DEBUG]  Create an independent component with scope [2] and condition []
2020-07-21 16:38:37,318 [DEBUG]  Create an independent component with scope [3] and condition []
2020-07-21 16:38:37,326 [DEBUG]  Current task with data (2035883, 1) scope [2] and condition []
2020-07-21 16:38:37,327 [DEBUG]  OP: Operation.CREATE_LEAF on slice (2035883, 1) (remaining tasks 8)
2020-07-21 16:38:37,622 [DEBUG]  		 created leaf Histogram for scope=[2] and condition=[] (in 0.29492 secs)
2020-07-21 16:38:37,623 [DEBUG]  Current task with data (2035883, 1) scope [3] and condition []
2020-07-21 16:38:37,624 [DEBUG]  OP: Operation.CREATE_LEAF on slice (2035883, 1) (remaining tasks 7)
2020-07-21 16:38:37,835 [DEBUG]  		 created leaf Histogram for scope=[3] and condition=[] (in 0.21034 secs)
2020-07-21 16:38:37

2020-07-21 16:42:48,903 [DEBUG]  		found 2 row clusters (in 0.16227 secs)
2020-07-21 16:42:48,908 [DEBUG]  Current task with data (937944, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 16:43:01,346 [DEBUG]  OP: Operation.SPLIT_ROWS_CONDITION on slice (937944, 5) (remaining tasks 14)
2020-07-21 16:43:01,348 [INFO ]  find optimal attribute: 0
2020-07-21 16:43:01,470 [INFO ]  find optimal clusters: [{4: [(1900.0, 2005.0)]}, {4: [(2006.0, 2012.0)]}]
2020-07-21 16:43:01,638 [DEBUG]  		found 2 row clusters (in 0.29046 secs)
2020-07-21 16:43:01,643 [DEBUG]  Current task with data (1053130, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 16:43:15,254 [DEBUG]  OP: Operation.REMOVE_CONDITION on slice (1053130, 5) (remaining tasks 15)
2020-07-21 16:43:15,256 [DEBUG]  Removed uniformative condition [4]
2020-07-21 16:43:15,367 [DEBUG]  Current task with data (723073, 6) scope [0, 1, 5, 6] and condition [3, 4]
2020-07-21 16:43:15,646 [DEBUG]  OP: Operation.REMOVE_UNINFORMATIVE_FEATURES on sl

2020-07-21 16:46:22,918 [DEBUG]  Current task with data (623504, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 16:46:35,720 [DEBUG]  OP: Operation.SPLIT_ROWS_CONDITION on slice (623504, 5) (remaining tasks 21)
2020-07-21 16:46:35,721 [INFO ]  find optimal attribute: 0
  clusters[np.where(data < m)] = 0
  clusters[np.where(data >= m)] = 1
2020-07-21 16:46:35,772 [INFO ]  find optimal clusters: [{4: [(1900.0, 2012.0)]}, {4: [(2013.0, 2021.0)]}]
2020-07-21 16:46:35,812 [DEBUG]  		found 2 row clusters (in 0.09086 secs)
2020-07-21 16:46:35,813 [DEBUG]  Current task with data (580153, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 16:46:48,346 [DEBUG]  OP: Operation.SPLIT_ROWS_CONDITION on slice (580153, 5) (remaining tasks 22)
2020-07-21 16:46:48,347 [INFO ]  find optimal attribute: 0
2020-07-21 16:46:48,398 [INFO ]  find optimal clusters: [{4: [(1903.0, 2013.0)]}, {4: [(2014.0, 2021.0)]}]
2020-07-21 16:46:48,436 [DEBUG]  		found 2 row clusters (in 0.08880 secs)
2020-07-21 16:46:48

2020-07-21 16:49:14,304 [DEBUG]  Removed uniformative condition [4]
2020-07-21 16:49:14,313 [DEBUG]  Current task with data (215718, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 16:49:26,446 [DEBUG]  OP: Operation.SPLIT_ROWS_CONDITION on slice (215718, 5) (remaining tasks 29)
2020-07-21 16:49:26,448 [INFO ]  find optimal attribute: 0
2020-07-21 16:49:26,470 [INFO ]  find optimal clusters: [{4: [(1900.0, 1996.0)]}, {4: [(1997.0, 2001.0)]}]
2020-07-21 16:49:26,484 [DEBUG]  		found 2 row clusters (in 0.03676 secs)
2020-07-21 16:49:26,486 [DEBUG]  Current task with data (218708, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 16:49:38,692 [DEBUG]  OP: Operation.REMOVE_CONDITION on slice (218708, 5) (remaining tasks 30)
2020-07-21 16:49:38,694 [DEBUG]  Removed uniformative condition [4]
2020-07-21 16:49:38,698 [DEBUG]  Current task with data (503518, 4) scope [0, 1, 5, 6] and condition []
2020-07-21 16:49:38,698 [DEBUG]  OP: Operation.CREATE_LEAF on slice (503518, 4) (remaining tas

2020-07-21 16:52:05,884 [INFO ]  find optimal attribute: 0
  clusters[np.where(data < m)] = 0
  clusters[np.where(data >= m)] = 1
2020-07-21 16:52:05,901 [INFO ]  find optimal clusters: [{4: [(1900.0, 2012.0)]}, {4: [(2013.0, 2021.0)]}]
2020-07-21 16:52:05,914 [DEBUG]  		found 2 row clusters (in 0.02938 secs)
2020-07-21 16:52:05,914 [DEBUG]  Current task with data (189343, 6) scope [0, 1, 5, 6] and condition [3, 4]
2020-07-21 16:52:05,926 [DEBUG]  OP: Operation.REMOVE_UNINFORMATIVE_FEATURES on slice (189343, 6) (remaining tasks 38)
2020-07-21 16:52:05,927 [DEBUG]  find uninformation condition, keeping only condition [4]
2020-07-21 16:52:05,931 [DEBUG]  Current task with data (321171, 6) scope [0, 1, 5, 6] and condition [3, 4]
2020-07-21 16:52:21,962 [DEBUG]  OP: Operation.REMOVE_CONDITION on slice (321171, 6) (remaining tasks 38)
2020-07-21 16:52:21,964 [DEBUG]  Removed uniformative condition [3]
2020-07-21 16:52:21,972 [DEBUG]  Current task with data (580456, 5) scope [0, 1, 5, 6] and

2020-07-21 16:55:10,281 [DEBUG]  Current task with data (196461, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 16:55:22,663 [DEBUG]  OP: Operation.SPLIT_ROWS_CONDITION on slice (196461, 5) (remaining tasks 42)
2020-07-21 16:55:22,664 [INFO ]  find optimal attribute: 0
2020-07-21 16:55:22,681 [INFO ]  find optimal clusters: [{4: [(2017.0, 2017.0)]}, {4: [(2018.0, 2021.0)]}]
2020-07-21 16:55:22,693 [DEBUG]  		found 2 row clusters (in 0.02900 secs)
2020-07-21 16:55:22,694 [DEBUG]  Current task with data (122258, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 16:55:34,778 [DEBUG]  OP: Operation.SPLIT_ROWS_CONDITION on slice (122258, 5) (remaining tasks 43)
2020-07-21 16:55:34,780 [INFO ]  find optimal attribute: 0
2020-07-21 16:55:34,795 [INFO ]  find optimal clusters: [{4: [(1903.0, 2002.0)]}, {4: [(2003.0, 2006.0)]}]
2020-07-21 16:55:34,806 [DEBUG]  		found 2 row clusters (in 0.02622 secs)
2020-07-21 16:55:34,807 [DEBUG]  Current task with data (157152, 5) scope [0, 1, 5, 6] and

2020-07-21 16:57:59,540 [INFO ]  find optimal attribute: 0
2020-07-21 16:57:59,562 [INFO ]  find optimal clusters: [{4: [(1900.0, 2006.0)]}, {4: [(2007.0, 2012.0)]}]
2020-07-21 16:57:59,575 [DEBUG]  		found 2 row clusters (in 0.03557 secs)
2020-07-21 16:57:59,576 [DEBUG]  Current task with data (218966, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 16:58:11,709 [DEBUG]  OP: Operation.REMOVE_CONDITION on slice (218966, 5) (remaining tasks 50)
2020-07-21 16:58:11,710 [DEBUG]  Removed uniformative condition [4]
2020-07-21 16:58:11,714 [DEBUG]  Current task with data (90445, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 16:58:21,556 [DEBUG]  OP: Operation.CREATE_LEAF on slice (90445, 5) (remaining tasks 50)
2020-07-21 16:58:21,583 [DEBUG]  		 created leaf Multi_histogram for scope=[0, 1, 5, 6] and condition=[4] (in 0.02439 secs)
2020-07-21 16:58:21,584 [DEBUG]  Current task with data (102486, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 16:58:33,510 [DEBUG]  OP: Operation.R

2020-07-21 17:00:45,587 [DEBUG]  		 created leaf Multi_histogram for scope=[0, 1, 5, 6] and condition=[4] (in 0.01650 secs)
2020-07-21 17:00:45,588 [DEBUG]  Current task with data (112629, 4) scope [0, 1, 5, 6] and condition []
2020-07-21 17:00:45,591 [DEBUG]  OP: Operation.CREATE_LEAF on slice (112629, 4) (remaining tasks 47)
2020-07-21 17:00:45,624 [DEBUG]  		 created leaf Multi_histogram for scope=[0, 1, 5, 6] and condition=[] (in 0.03242 secs)
2020-07-21 17:00:45,625 [DEBUG]  Current task with data (132339, 4) scope [0, 1, 5, 6] and condition []
2020-07-21 17:00:45,626 [DEBUG]  OP: Operation.CREATE_LEAF on slice (132339, 4) (remaining tasks 46)
2020-07-21 17:00:45,666 [DEBUG]  		 created leaf Multi_histogram for scope=[0, 1, 5, 6] and condition=[] (in 0.03926 secs)
2020-07-21 17:00:45,667 [DEBUG]  Current task with data (57728, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 17:00:45,672 [DEBUG]  OP: Operation.REMOVE_UNINFORMATIVE_FEATURES on slice (57728, 5) (remaining tasks 45

2020-07-21 17:03:03,985 [DEBUG]  		 created leaf Multi_histogram for scope=[0, 1, 5, 6] and condition=[] (in 0.02578 secs)
2020-07-21 17:03:03,986 [DEBUG]  Current task with data (53819, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 17:03:08,502 [DEBUG]  OP: Operation.CREATE_LEAF on slice (53819, 5) (remaining tasks 42)
2020-07-21 17:03:08,519 [DEBUG]  		 created leaf Multi_histogram for scope=[0, 1, 5, 6] and condition=[4] (in 0.01557 secs)
2020-07-21 17:03:08,520 [DEBUG]  Current task with data (70859, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 17:03:15,947 [DEBUG]  OP: Operation.REMOVE_CONDITION on slice (70859, 5) (remaining tasks 41)
2020-07-21 17:03:15,949 [DEBUG]  Removed uniformative condition [4]
2020-07-21 17:03:15,950 [DEBUG]  Current task with data (102940, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 17:03:27,130 [DEBUG]  OP: Operation.SPLIT_ROWS_CONDITION on slice (102940, 5) (remaining tasks 41)
2020-07-21 17:03:27,131 [INFO ]  find optimal attribute: 

2020-07-21 17:05:11,239 [DEBUG]  OP: Operation.CREATE_LEAF on slice (97001, 4) (remaining tasks 40)
2020-07-21 17:05:11,264 [DEBUG]  		 created leaf Multi_histogram for scope=[0, 1, 5, 6] and condition=[] (in 0.02412 secs)
2020-07-21 17:05:11,265 [DEBUG]  Current task with data (99463, 4) scope [0, 1, 5, 6] and condition []
2020-07-21 17:05:11,265 [DEBUG]  OP: Operation.CREATE_LEAF on slice (99463, 4) (remaining tasks 39)
2020-07-21 17:05:11,289 [DEBUG]  		 created leaf Multi_histogram for scope=[0, 1, 5, 6] and condition=[] (in 0.02373 secs)
2020-07-21 17:05:11,290 [DEBUG]  Current task with data (67922, 6) scope [0, 1, 5, 6] and condition [2, 3]
2020-07-21 17:05:11,294 [DEBUG]  OP: Operation.REMOVE_UNINFORMATIVE_FEATURES on slice (67922, 6) (remaining tasks 38)
2020-07-21 17:05:11,295 [DEBUG]  find uninformation condition, keeping only condition [2]
2020-07-21 17:05:11,296 [DEBUG]  Current task with data (77226, 6) scope [0, 1, 5, 6] and condition [2, 3]
2020-07-21 17:05:20,654 [DEBU

2020-07-21 17:06:03,844 [DEBUG]  Current task with data (59560, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 17:06:09,042 [DEBUG]  OP: Operation.REMOVE_CONDITION on slice (59560, 5) (remaining tasks 26)
2020-07-21 17:06:09,043 [DEBUG]  Removed uniformative condition [4]
2020-07-21 17:06:09,045 [DEBUG]  Current task with data (111492, 4) scope [0, 1, 5, 6] and condition []
2020-07-21 17:06:09,045 [DEBUG]  OP: Operation.CREATE_LEAF on slice (111492, 4) (remaining tasks 26)
2020-07-21 17:06:09,079 [DEBUG]  		 created leaf Multi_histogram for scope=[0, 1, 5, 6] and condition=[] (in 0.03288 secs)
2020-07-21 17:06:09,080 [DEBUG]  Current task with data (128900, 4) scope [0, 1, 5, 6] and condition []
2020-07-21 17:06:09,080 [DEBUG]  OP: Operation.CREATE_LEAF on slice (128900, 4) (remaining tasks 25)
2020-07-21 17:06:09,115 [DEBUG]  		 created leaf Multi_histogram for scope=[0, 1, 5, 6] and condition=[] (in 0.03433 secs)
2020-07-21 17:06:09,116 [DEBUG]  Current task with data (144739, 4)

2020-07-21 17:07:04,186 [DEBUG]  		 created leaf Multi_histogram for scope=[0, 1, 5, 6] and condition=[] (in 0.02218 secs)
2020-07-21 17:07:04,187 [DEBUG]  Current task with data (61179, 4) scope [0, 1, 5, 6] and condition []
2020-07-21 17:07:04,187 [DEBUG]  OP: Operation.CREATE_LEAF on slice (61179, 4) (remaining tasks 11)
2020-07-21 17:07:04,204 [DEBUG]  		 created leaf Multi_histogram for scope=[0, 1, 5, 6] and condition=[] (in 0.01548 secs)
2020-07-21 17:07:04,205 [DEBUG]  Current task with data (38969, 4) scope [0, 1, 5, 6] and condition []
2020-07-21 17:07:04,205 [DEBUG]  OP: Operation.CREATE_LEAF on slice (38969, 4) (remaining tasks 10)
2020-07-21 17:07:04,215 [DEBUG]  		 created leaf Multi_histogram for scope=[0, 1, 5, 6] and condition=[] (in 0.00905 secs)
2020-07-21 17:07:04,216 [DEBUG]  Current task with data (46037, 5) scope [0, 1, 5, 6] and condition [4]
2020-07-21 17:07:04,219 [DEBUG]  OP: Operation.REMOVE_UNINFORMATIVE_FEATURES on slice (46037, 5) (remaining tasks 9)
2020

---Structure Statistics---
# nodes               167
    # sum nodes       77
    # factorize nodes 1
    # prod nodes      4
    # leaf nodes      85
# params              154
# edges               164
# layers              12


In [20]:
model = FSPN()
model.model = fspn
model.store_factorize_as_dict()

In [None]:
model3 = FSPN()
model3.model = model2.model
model3.store_factorize_as_dict()

In [21]:
query_l = np.load("/home/ziniu.wzn/DMV/cardinality/query_left.npy")
query_r = np.load("/home/ziniu.wzn/DMV/cardinality/query_right.npy")
true_card = np.load("/home/ziniu.wzn/DMV/cardinality/query_true.npy")
print(true_card.shape)
print(query_l.shape)
print(query_r.shape)

(1986,)
(1986, 10)
(1986, 10)


In [22]:
tic = time.time()
card = model.probability((query_l, query_r), calculated=dict()) * 11575483
print(time.time()-tic)
card[card==0.0] = 1
errors = np.maximum(np.divide(card, true_card), np.divide(true_card, card))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
(array([[-inf,  23.,   2.,   0.],
       [  0., -inf,   1., -inf],
       [  0.,   6., -inf,   0.],
       ...,
       [  0.,   0.,   0.,   0.],
       [-inf,   0., -inf, -inf],
       [-inf,   0., -inf,   0.]]), array([[inf, 52., 11.,  2.],
       [ 1., inf, 14., inf],
       [ 1., 41., inf,  3.],
       ...,
       [ 1., 25., 11.,  1.],
       [inf, 16., inf, inf],
       [inf,  5., inf,  1.]]))
scope_prob of FactorizeNode_1 is [[5.96317644e-04 7.88452449e-04 9.69328968e-04 ... 2.71391984e-03
  2.42925709e-03 1.70926890e-03]
 [4.00383618e-01 5.51857133e-01 5.42920241e-01 ... 4.17601359e-01
  4.69523866e-01 3.95449719e-01]
 [7.00350384e-02 6.04486831e-02 5.58095726e-02 ... 1.31044431e-01
  7.42322075e-01 1.18211483e-01]
 ...
 [9.47249627e-01 9.34091731e-01 8.92377051e-01 ... 8.76663346e-01
  9.27033951e-01 9.01574599e-01]
 [9.86711992e-01 9.88467145e-01 9.89991221e-01 ... 9.85061216e-01
  9.82936309e-01 9.87530106e-01]
 [9.06485429e-01 9.13740919e-01 9.0

In [None]:
tic = time.time()
card2 = model3.probability((query_l, query_r), calculated=dict()) * 11575483
print(time.time()-tic)
card2[card2==0.0] = 1
errors2 = np.maximum(np.divide(card2, true_card), np.divide(true_card, card2))

In [None]:
model.leaves[1]

In [None]:
import matplotlib.pyplot as plt
x = np.arange(len(errors[errors<20]))
idx = np.argsort(errors[errors<20])
plt.plot(x, errors[errors<20][idx])
plt.show()
plt.plot(x, errors2[errors<20][idx])
plt.show()

In [23]:
for i in [50,90,95,99,100]:
    print(np.percentile(errors, i))

1.003694044625834
1.0567153252265475
1.1100142114948173
1.331793185620488
8.77622022143622


In [24]:
import pickle
pickle.dump(model, open('fspn167.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

In [None]:
with open('fspn20.pkl', 'rb') as f:
    model2 = pickle.load(f)

In [None]:
with np.printoptions(precision=3, suppress=True):
    print(model.leaves_range[1])

In [None]:
print(model.leaves_condition[1])
print(model2.leaves_condition[1])

In [None]:
model2.leaves_range[1]

In [None]:
fact_prob1 = np.load("fact_prob1.npy")
scope_prob1 = np.load("scope_prob1.npy")
condition_prob1 = np.load("cond_prob1.npy")
prob2 = np.load("fact_prob2.npy")
scope_prob2 = np.load("scope_prob2.npy")
condition_prob2 = np.load("cond_prob2.npy")

In [None]:
print(scope_prob1.shape)
print(scope_prob2.shape)
print(condition_prob1.shape)
print(condition_prob2.shape)

In [None]:
print(np.max(np.sum(condition_prob2, axis=1)))

In [None]:
print(np.max(np.sum(condition_prob1, axis=1)))

In [None]:
np.sum(condition_prob1[11])

In [None]:
import matplotlib.pyplot as plt
x = np.arange(len(prob1))
idx = np.argsort(prob1)
plt.plot(x, prob1[idx], color="red")
plt.plot(x, prob2[idx])
plt.show()

In [None]:
import matplotlib.pyplot as plt
x = np.arange(len(prob1))
sum1 = np.sum(condition_prob1, axis=1)
sum2 = np.sum(condition_prob2, axis=1)
idx = np.argsort(sum1)
plt.plot(x, sum1[idx], color="red")
plt.plot(x, sum2[idx])
plt.show()

In [None]:
model.leaves_range[1]

In [None]:
model3.leaves_range[1]

In [None]:
print(model.weak_connected_leaves)
print(model3.weak_connected_leaves)

In [None]:
card2 = model.probability((query_l[[9]], query_r[[9]]), calculated=dict()) * 11575483

In [None]:
query_l[[9]], query_r[[9]]

In [None]:
card2 = model.probability((query_l[[9]], query_r[[9]]), calculated=dict()) * 11575483

In [None]:
f = get_nodes_by_type(model.model, Factorize)

In [None]:
n = f[0].children[0]

In [None]:
n.scope

In [None]:
temp_l = np.zeros((6,2))-np.infty
temp_r = np.zeros((6,2))+np.infty
val = 0
for i in range(5):
    temp_l[i+1, 0] = val+0.99
    temp_r[i, 0] = val
    val+=1

In [None]:
temp_l, temp_r

In [None]:
p = model.probability((temp_l, temp_r), n, calculated=dict())

In [None]:
np.sum(p)

In [None]:
np.unique(sample_data[:,2])

In [None]:
np.sum(sample_data[:,2]==0)
np.sum(sample_data[:,2]==1)