In [1]:
import pickle
import numpy as np
import pandas as pd
import copy
import os
import sys
sys.path.append("/home/ubuntu/CE_scheme/")
from Schemas.imdb.schema import gen_imdb_schema
from Join_scheme.data_prepare import read_table_csv
from Join_scheme.bound import Bound_ensemble
from Join_scheme.join_graph import parse_query_all_join, get_join_hyper_graph

In [2]:
import copy
import logging
import pickle
import numpy as np
import pandas as pd
import time

from Schemas.imdb.schema import gen_imdb_schema
from Join_scheme.binning import identify_key_values, sub_optimal_bucketize, Table_bucket
from Join_scheme.binning import apply_binning_to_data_value_count
from Join_scheme.bound import Factor

logger = logging.getLogger(__name__)


def timestamp_transorform(time_string, start_date="2010-07-19 00:00:00"):
    start_date_int = time.strptime(start_date, "%Y-%m-%d %H:%M:%S")
    time_array = time.strptime(time_string, "%Y-%m-%d %H:%M:%S")
    return int(time.mktime(time_array)) - int(time.mktime(start_date_int))


def read_table_hdf(table_obj):
    """
    Reads hdf from path, renames columns and drops unnecessary columns
    """
    df_rows = pd.read_hdf(table_obj.csv_file_location)
    df_rows.columns = [table_obj.table_name + '.' + attr for attr in table_obj.attributes]

    for attribute in table_obj.irrelevant_attributes:
        df_rows = df_rows.drop(table_obj.table_name + '.' + attribute, axis=1)

    return df_rows.apply(pd.to_numeric, errors="ignore")


def read_table_csv(table_obj, csv_seperator=',', stats=True):
    """
    Reads csv from path, renames columns and drops unnecessary columns
    """
    if stats:
        df_rows = pd.read_csv(table_obj.csv_file_location)
    else:
        df_rows = pd.read_csv(table_obj.csv_file_location, header=None, escapechar='\\', encoding='utf-8',
                              quotechar='"',
                              sep=csv_seperator)
    df_rows.columns = [table_obj.table_name + '.' + attr for attr in table_obj.attributes]

    for attribute in table_obj.irrelevant_attributes:
        df_rows = df_rows.drop(table_obj.table_name + '.' + attribute, axis=1)

    return df_rows.apply(pd.to_numeric, errors="ignore")


def make_sample(np_data, nrows=1000000, seed=0):
    np.random.seed(seed)
    samp_data = np_data[np_data != -1]
    if len(samp_data) <= nrows:
        return samp_data, 1.0
    else:
        selected = np.random.choice(len(samp_data), size=nrows, replace=False)
        return samp_data[selected], nrows/len(samp_data)


def stats_analysis(sample, data, sample_rate, show=10):
    n, c = np.unique(sample, return_counts=True)
    idx = np.argsort(c)[::-1]
    for i in range(min(show, len(idx))):
        print(c[idx[i]], c[idx[i]]/sample_rate, len(data[data == n[idx[i]]]))


def get_ground_truth_no_filter(equivalent_keys, data, bins, table_lens, na_values):
    all_factor_pdfs = dict()
    for PK in equivalent_keys:
        bin_value = bins[PK]
        for key in equivalent_keys[PK]:
            table = key.split(".")[0]
            temp = apply_binning_to_data_value_count(bin_value, data[key])
            if table not in all_factor_pdfs:
                all_factor_pdfs[table] = dict()
            all_factor_pdfs[table][key] = temp / np.sum(temp)

    all_factors = dict()
    for table in all_factor_pdfs:
        all_factors[table] = Factor(table, table_lens[table], list(all_factor_pdfs[table].keys()),
                                    all_factor_pdfs[table], na_values[table])
    return all_factors


def process_imdb_data(data_path, model_folder, n_bins, sample_size=100000, save_bucket_bins=False):
    schema = gen_imdb_schema(data_path)
    all_keys, equivalent_keys = identify_key_values(schema)
    data = dict()
    table_lens = dict()
    na_values = dict()
    primary_keys = []
    for table_obj in schema.tables:
        df_rows = pd.read_csv(table_obj.csv_file_location, header=None, escapechar='\\', encoding='utf-8',
                              quotechar='"',
                              sep=",")

        df_rows.columns = [table_obj.table_name + '.' + attr for attr in table_obj.attributes]
        for attribute in table_obj.irrelevant_attributes:
            df_rows = df_rows.drop(table_obj.table_name + '.' + attribute, axis=1)

        df_rows.apply(pd.to_numeric, errors="ignore")
        table_lens[table_obj.table_name] = len(df_rows)
        if table_obj.table_name not in na_values:
            na_values[table_obj.table_name] = dict()
        for attr in df_rows.columns:
            if attr in all_keys:
                data[attr] = df_rows[attr].values
                data[attr][np.isnan(data[attr])] = -1
                data[attr][data[attr] < 0] = -1
                na_values[table_obj.table_name][attr] = len(data[attr][data[attr] != -1]) / table_lens[
                    table_obj.table_name]
                data[attr] = copy.deepcopy(data[attr])[data[attr] >= 0]
                if len(np.unique(data[attr])) >= len(data[attr]) - 10:
                    primary_keys.append(attr)

    sample_rate = dict()
    sampled_data = dict()
    for k in data:
        temp = make_sample(data[k], 1000000)
        sampled_data[k] = temp[0]
        sample_rate[k] = temp[1]

    optimal_buckets = dict()
    bin_size = dict()
    all_bin_modes = dict()
    for PK in equivalent_keys:
        # if PK != 'kind_type.id':
        #   continue
        group_data = {}
        group_sample_rate = {}
        for K in equivalent_keys[PK]:
            group_data[K] = sampled_data[K]
            group_sample_rate[K] = sample_rate[K]
        _, optimal_bucket = sub_optimal_bucketize(group_data, group_sample_rate, n_bins=n_bins[PK], primary_keys=primary_keys)
        optimal_buckets[PK] = optimal_bucket
        for K in equivalent_keys[PK]:
            temp_table_name = K.split(".")[0]
            if temp_table_name not in bin_size:
                bin_size[temp_table_name] = dict()
                all_bin_modes[temp_table_name] = dict()
            bin_size[temp_table_name][K] = len(optimal_bucket.bins)
            all_bin_modes[temp_table_name][K] = optimal_bucket.buckets[K].bin_modes

    table_buckets = dict()
    for table_name in bin_size:
        table_buckets[table_name] = Table_bucket(table_name, list(bin_size[table_name].keys()), bin_size[table_name],
                                                 all_bin_modes[table_name])
    
    all_bins = dict()
    for key in optimal_buckets:
        all_bins[key] = optimal_buckets[key].bins

    ground_truth_factors_no_filter = get_ground_truth_no_filter(equivalent_keys, data, all_bins, table_lens, na_values)

    if save_bucket_bins:
        with open(model_folder + f"/imdb_buckets.pkl") as f:
            pickle.dump(optimal_buckets, f, pickle.HIGHEST_PROTOCOL)

    return schema, table_buckets, ground_truth_factors_no_filter


In [3]:
n_bins = {
    'title.id': 800,
    'info_type.id': 100,
    'keyword.id': 100,
    'company_name.id': 100,
    'name.id': 100,
    'company_type.id': 100,
    'comp_cast_type.id': 50,
    'kind_type.id': 50,
    'char_name.id': 50,
    'role_type.id': 50,
    'link_type.id': 50
}
data_path = "/home/ubuntu/data_CE/imdb/{}.csv"
model_folder = "/home/ubuntu/data_CE/CE_scheme_models/"
schema, table_buckets, ground_truth_factors_no_filter = process_imdb_data(data_path, model_folder, n_bins,
                                                                              False)

  if (await self.run_code(code, result,  async_=asy)):


KeyboardInterrupt: 

In [31]:
import numpy as np
import copy

from Join_scheme.join_graph import get_join_hyper_graph, parse_query_all_join
from Join_scheme.data_prepare import identify_key_values
#from Sampling.load_sample import load_sample_imdb_one_query


class Factor:
    """
    This the class defines a multidimensional conditional probability on one table.
    """

    def __init__(self, table, table_len, variables, pdfs, na_values=None):
        self.table = table
        self.table_len = table_len
        self.variables = variables
        self.pdfs = pdfs
        self.na_values = na_values  # this is the percentage of data, which is not nan.


class Group_Factor:
    """
        This the class defines a multidimensional conditional probability on a group of tables.
    """

    def __init__(self, tables, tables_size, variables, pdfs, bin_modes, equivalent_groups=None, 
                 table_key_equivalent_group=None, na_values=None, join_cond=None):
        self.table = tables
        self.tables_size = tables_size
        self.variables = variables
        self.pdfs = pdfs
        self.bin_modes = bin_modes
        self.equivalent_groups = equivalent_groups
        self.table_key_equivalent_group = table_key_equivalent_group
        self.na_values = na_values
        self.join_cond = join_cond


class Bound_ensemble:
    """
    This the class where we store all the trained models and perform inference on the bound.
    """

    def __init__(self, table_buckets, schema, ground_truth_factors_no_filter=None):
        self.table_buckets = table_buckets
        self.schema = schema
        self.all_keys, self.equivalent_keys = identify_key_values(schema)
        self.all_join_conds = None
        self.ground_truth_factors_no_filter = ground_truth_factors_no_filter
        # self.reverse_table_alias = None

    def parse_query_simple(self, query):
        """
        If your selection query contains no aggregation and nested sub-queries, you can use this function to parse a
        join query. Otherwise, use parse_query function.
        """
        tables_all, join_cond, join_keys = parse_query_all_join(query)
        # TODO: implement functions on parsing filter conditions.
        table_filters = dict()
        return tables_all, table_filters, join_cond, join_keys

    def get_all_id_conidtional_distribution(self, query_file_name, tables_alias, join_keys):
        # TODO: make it work on query-driven and sampling based
        return load_sample_imdb_one_query(self.table_buckets, tables_alias, query_file_name, join_keys,
                                          self.ground_truth_factors_no_filter)

    def eliminate_one_key_group(self, tables, key_group, factors, relevant_keys):
        """This version only supports 2D distributions (i.e. the distribution learned with tree-structured PGM)"""
        rest_group = None
        rest_group_cardinalty = 0
        eliminated_tables = []
        rest_group_tables = []
        for table in tables:
            assert key_group in factors[table].equivalent_variables
            temp = copy.deepcopy(factors[table].equivalent_variables)
            temp.remove(key_group)
            if len(temp) == 0:
                eliminated_tables.append(table)
            for key in temp:
                if rest_group:
                    assert factors[table].cardinalities[key] == rest_group_cardinalty
                    rest_group_tables.append(table)
                else:
                    rest_group = key
                    rest_group_cardinalty = factors[table].cardinalities[key]
                    rest_group_tables = [table]

        all_probs_eliminated = []
        all_modes_eliminated = []
        for table in eliminated_tables:
            bin_modes = self.table_buckets[table].oned_bin_modes[relevant_keys[key_group][table]]
            all_probs_eliminated.append(factors[table].pdfs)
            all_modes_eliminated.append(np.minimum(bin_modes, factors[table].pdfs))

        if rest_group:
            new_factor_pdf = np.zeros(rest_group_cardinalty)
        else:
            return self.compute_bound_oned(all_probs_eliminated, all_modes_eliminated)

        for i in range(rest_group_cardinalty):
            for table in rest_group_tables:
                idx_f = factors[table].equivalent_variables.index(key_group)
                idx_b = self.table_buckets[table].id_attributes.index(relevant_keys[key_group][table])
                bin_modes = self.table_buckets[table].twod_bin_modes[relevant_keys[key_group][table]]
                if idx_f == 0 and idx_b == 0:
                    all_probs_eliminated.append(factors[table].pdfs[:, i])
                    all_modes_eliminated.append(np.minimum(bin_modes[:, i], factors[table].pdfs[:, i]))
                elif idx_f == 0 and idx_b == 1:
                    all_probs_eliminated.append(factors[table].pdfs[:, i])
                    all_modes_eliminated.append(np.minimum(bin_modes[i, :], factors[table].pdfs[:, i]))
                elif idx_f == 1 and idx_b == 0:
                    all_probs_eliminated.append(factors[table].pdfs[i, :])
                    all_modes_eliminated.append(np.minimum(bin_modes[:, i], factors[table].pdfs[i, :]))
                else:
                    all_probs_eliminated.append(factors[table].pdfs[i, :])
                    all_modes_eliminated.append(np.minimum(bin_modes[i, :], factors[table].pdfs[i, :]))
            new_factor_pdf[i] = self.compute_bound_oned(all_probs_eliminated, all_modes_eliminated)

        for table in rest_group_tables:
            factors[table] = Factor([rest_group], new_factor_pdf, [rest_group])

        return None

    def compute_bound_oned(self, all_probs, all_modes, return_factor=False):
        temp_all_modes = []
        for i in range(len(all_modes)):
            temp_all_modes.append(np.minimum(all_probs[i], all_modes[i]))
        all_probs = np.stack(all_probs, axis=0)
        temp_all_modes = np.stack(temp_all_modes, axis=0)
        multiplier = np.prod(temp_all_modes, axis=0)
        non_zero_idx = np.where(multiplier != 0)[0]
        min_number = np.amin(all_probs[:, non_zero_idx] / temp_all_modes[:, non_zero_idx], axis=0)
        # print(min_number, multiplier[non_zero_idx])
        if return_factor:
            new_probs = np.zeros(multiplier.shape)
            new_probs[non_zero_idx] = multiplier[non_zero_idx] * min_number
            return new_probs, multiplier
        else:
            multiplier[non_zero_idx] = multiplier[non_zero_idx] * min_number
            return np.sum(multiplier)

    def get_optimal_elimination_order(self, equivalent_group, join_keys, factors):
        """
        This function determines the optimial elimination order for each key group
        """
        cardinalities = dict()
        lengths = dict()
        tables_involved = dict()
        relevant_keys = dict()
        for group in equivalent_group:
            relevant_keys[group] = dict()
            lengths[group] = len(equivalent_group[group])
            cardinalities[group] = []
            tables_involved[group] = set([])
            for keys in equivalent_group[group]:
                for table in join_keys:
                    if keys in join_keys[table]:
                        cardinalities[group].append(len(join_keys[table]))
                        tables_involved[group].add(table)
                        variables = factors[table].variables
                        variables[variables.index(keys)] = group
                        factors[table].variables = variables
                        relevant_keys[group][table] = keys
                        break
            cardinalities[group] = np.asarray(cardinalities[group])

        optimal_order = list(equivalent_group.keys())
        for i in range(len(optimal_order)):
            min_idx = i
            for j in range(i + 1, len(optimal_order)):
                min_group = optimal_order[min_idx]
                curr_group = optimal_order[j]
                if np.max(cardinalities[curr_group]) < np.max(cardinalities[min_group]):
                    min_idx = j
                else:
                    min_max_tables = np.max(cardinalities[min_group])
                    min_num_max_tables = len(np.where(cardinalities[min_group] == min_max_tables)[0])
                    curr_max_tables = np.max(cardinalities[curr_group])
                    curr_num_max_tables = len(np.where(cardinalities[curr_group] == curr_max_tables)[0])
                    if curr_num_max_tables < min_num_max_tables:
                        min_idx = j
                    elif lengths[curr_group] < lengths[min_group]:
                        min_idx = j
            optimal_order[i], optimal_order[min_idx] = optimal_order[min_idx], optimal_order[i]
        return optimal_order, tables_involved, relevant_keys

    def get_cardinality_bound_one(self, query_str):
        tables_all, table_queries, join_cond, join_keys = self.parse_query_simple(query_str)
        equivalent_group = get_join_hyper_graph(join_keys, self.equivalent_keys)
        conditional_factors = self.get_all_id_conidtional_distribution(table_queries, join_keys, equivalent_group)
        optimal_order, tables_involved, relevant_keys = self.get_optimal_elimination_order(equivalent_group, join_keys,
                                                                                           conditional_factors)

        for key_group in optimal_order:
            tables = tables_involved[key_group]
            res = self.eliminate_one_key_group(tables, key_group, conditional_factors, relevant_keys)
        return res

    def get_sub_plan_queries_sql(self, query_str, sub_plan_query_str_all, query_name=None):
        tables_all, table_queries, join_cond, join_keys = self.parse_query_simple(query_str)
        equivalent_group, table_equivalent_group, table_key_equivalent_group = get_join_hyper_graph(join_keys,
                                                                                                    self.equivalent_keys)
        cached_sub_queries_sql = dict()
        cached_union_key_group = dict()
        res_sql = []
        for (left_tables, right_tables) in sub_plan_query_str_all:
            assert " " not in left_tables, f"{left_tables} contains more than one tables, violating left deep plan"
            sub_plan_query_list = right_tables.split(" ") + [left_tables]
            sub_plan_query_list.sort()
            sub_plan_query_str = " ".join(sub_plan_query_list)  # get the string name of the sub plan query
            sql_header = "SELECT COUNT(*) FROM "
            for alias in sub_plan_query_list:
                sql_header += (tables_all[alias] + " AS " + alias + ", ")
            sql_header = sql_header[:-2] + " WHERE "
            if " " in right_tables:
                assert right_tables in cached_sub_queries_sql, f"{right_tables} not in cache, input is not ordered"
                right_sql = cached_sub_queries_sql[right_tables]
                right_union_key_group = cached_union_key_group[right_tables]
                if left_tables in table_queries:
                    left_sql = table_queries[left_tables]
                    curr_sql = right_sql + " AND (" + left_sql + ")"
                else:
                    curr_sql = right_sql
                additional_joins, union_key_group = self.get_additional_join_with_table_group(left_tables,
                                                                                              right_union_key_group,
                                                                                              table_equivalent_group,
                                                                                              table_key_equivalent_group)
                for join in additional_joins:
                    curr_sql = curr_sql + " AND " + join
            else:
                curr_sql = ""
                if left_tables in table_queries:
                    curr_sql += ("(" + table_queries[left_tables] + ")")
                if right_tables in table_queries:
                    if curr_sql != "":
                        curr_sql += " AND "
                    curr_sql += ("(" + table_queries[right_tables] + ")")

                additional_joins, union_key_group = self.get_additional_joins_two_tables(left_tables, right_tables,
                                                                                         table_equivalent_group,
                                                                                         table_key_equivalent_group)
                for join in additional_joins:
                    if curr_sql == "":
                        curr_sql += join
                    else:
                        curr_sql = curr_sql + " AND " + join
            cached_sub_queries_sql[sub_plan_query_str] = curr_sql
            cached_union_key_group[sub_plan_query_str] = union_key_group
            res_sql.append(sql_header + curr_sql + ";")
        return res_sql

    def get_cardinality_bound_all(self, query_str, sub_plan_query_str_all, query_name=None, debug=False,
                                  true_card=None):
        """
        Get the cardinality bounds for all sub_plan_queires of a query.
        Note: Due to efficiency, this current version only support left_deep plans (like the one generated by postgres),
              but it can easily support right deep or bushy plans.
        :param query_str: the target query
        :param sub_plan_query_str_all: all sub_plan_queries of the target query,
               it should be sorted by number of the tables in the sub_plan_query
        """
        tables_all, table_queries, join_cond, join_keys = self.parse_query_simple(query_str)
        #print(join_cond)
        # print(join_keys)
        equivalent_group, table_equivalent_group, table_key_equivalent_group, table_key_group_map = \
            get_join_hyper_graph(join_keys, self.equivalent_keys)
        conditional_factors = self.get_all_id_conidtional_distribution(query_name, tables_all, join_keys)
        # self.reverse_table_alias = {v: k for k, v in tables_all.items()}
        cached_sub_queries = dict()
        cardinality_bounds = []
        for i, (left_tables, right_tables) in enumerate(sub_plan_query_str_all):
            assert " " not in left_tables, f"{left_tables} contains more than one tables, violating left deep plan"
            sub_plan_query_list = right_tables.split(" ") + [left_tables]
            sub_plan_query_list.sort()
            sub_plan_query_str = " ".join(sub_plan_query_list)  # get the string name of the sub plan query
            # print(sub_plan_query_str)
            #print(sub_plan_query_str, "=========================================")
            if " " in right_tables:
                assert right_tables in cached_sub_queries, f"{right_tables} not in cache, input is not ordered"
                right_bound_factor = cached_sub_queries[right_tables]
                curr_bound_factor, res = self.join_with_one_table(sub_plan_query_str,
                                                                  left_tables,
                                                                  tables_all,
                                                                  right_bound_factor,
                                                                  conditional_factors[left_tables],
                                                                  table_equivalent_group,
                                                                  table_key_equivalent_group,
                                                                  table_key_group_map,
                                                                  join_cond)
            else:
                curr_bound_factor, res = self.join_two_tables(sub_plan_query_str,
                                                              left_tables,
                                                              right_tables,
                                                              tables_all,
                                                              conditional_factors,
                                                              join_keys,
                                                              table_equivalent_group,
                                                              table_key_equivalent_group,
                                                              table_key_group_map,
                                                              join_cond)
            cached_sub_queries[sub_plan_query_str] = curr_bound_factor
            res = max(res, 1)
            if debug:
                if true_card[i] == -1:
                    error = "NA"
                else:
                    error = max(res / true_card[i], true_card[i] / res)
                #print(f"{left_tables}, {right_tables}|| estimate: {res}, true: {true_card[i]}, error: {error}")
            cardinality_bounds.append(res)
        return cardinality_bounds

    def join_with_one_table(self, sub_plan_query_str, left_table, tables_all, right_bound_factor, cond_factor_left,
                            table_equivalent_group, table_key_equivalent_group, table_key_group_map, join_cond):
        """
        Get the cardinality bound by joining the left_table with the seen right_tables
        :param left_table:
        :param right_tables:
        """
        equivalent_key_group, union_key_group_set, union_key_group, new_join_cond = \
            self.get_join_keys_with_table_group(left_table, right_bound_factor, tables_all, table_equivalent_group,
                                                table_key_equivalent_group, table_key_group_map, join_cond)
        bin_mode_left = self.table_buckets[tables_all[left_table]].oned_bin_modes
        bin_mode_right = right_bound_factor.bin_modes
        key_group_pdf = dict()
        key_group_bin_mode = dict()
        new_union_key_group = dict()
        new_na_values = dict()
        right_variables = right_bound_factor.variables
        new_variables = copy.deepcopy(right_variables)
        res = right_bound_factor.tables_size
        #print("\n")
        #print(union_key_group_set)
        #print(union_key_group)
        for key_group in equivalent_key_group:
            #print(key_group, equivalent_key_group[key_group], res)
            # print(cond_factor_left.na_values)
            # print(right_bound_factor.na_values)
            #print(cond_factor_left.pdfs.keys())
            #print(right_bound_factor.pdfs.keys())
            all_pdfs = [cond_factor_left.pdfs[key] * cond_factor_left.table_len * cond_factor_left.na_values[key]
                        for key in equivalent_key_group[key_group]["left"]]
            all_bin_modes = [bin_mode_left[key] for key in equivalent_key_group[key_group]["left"]]
            for key in equivalent_key_group[key_group]["left"]:
                new_variables[key] = key_group
            for key in equivalent_key_group[key_group]["right"]:
                if key in right_bound_factor.pdfs:
                    new_variables[key] = key_group
                    all_pdfs.append(right_bound_factor.pdfs[key] * res * right_bound_factor.na_values[key])
                    all_bin_modes.append(bin_mode_right[key])
                else:
                    key = right_variables[key]
                    all_pdfs.append(right_bound_factor.pdfs[key] * res * right_bound_factor.na_values[key])
                    all_bin_modes.append(bin_mode_right[key])

            new_pdf, new_bin_mode = self.compute_bound_oned(all_pdfs, all_bin_modes, return_factor=True)
            res = np.sum(new_pdf)
            if res == 0:
                res = 10.0
                new_pdf[-1] = 1
                key_group_pdf[key_group] = new_pdf
            else:
                key_group_pdf[key_group] = new_pdf / res
            key_group_bin_mode[key_group] = new_bin_mode
            new_union_key_group[key_group] = [key_group]
            new_na_values[key_group] = 1
        
        for group in union_key_group:
            if group not in new_union_key_group:
                new_union_key_group[group] = []
            for table, keys in union_key_group[group]:
                for key in keys:
                    new_union_key_group[group].append(key)
                    if table == "left":
                        key_group_pdf[key] = cond_factor_left.pdfs[key]
                        key_group_bin_mode[key] = self.table_buckets[tables_all[left_table]].oned_bin_modes[key]
                        new_na_values[key] = cond_factor_left.na_values[key]
                    else:
                        key_group_pdf[key] = right_bound_factor.pdfs[key]
                        key_group_bin_mode[key] = right_bound_factor.bin_modes[key]
                        new_na_values[key] = right_bound_factor.na_values[key]
        
        #print("****", key_group_pdf.keys())
        new_factor = Group_Factor(sub_plan_query_str, res, new_variables, key_group_pdf, key_group_bin_mode,
                                  union_key_group_set, new_union_key_group, new_na_values, new_join_cond)
        return new_factor, res

    def get_join_keys_with_table_group(self, left_table, right_bound_factor, tables_all, table_equivalent_group,
                                       table_key_equivalent_group, table_key_group_map, join_cond):
        """
            Get the join keys between two tables
        """

        actual_join_cond = []
        for cond in join_cond[left_table]:
            if cond in right_bound_factor.join_cond:
                actual_join_cond.append(cond)
        #print(join_cond[left_table], right_bound_factor.join_cond)
        #print(actual_join_cond)
        equivalent_key_group = dict()
        union_key_group_set = table_equivalent_group[left_table].union(right_bound_factor.equivalent_groups)
        union_key_group = dict()
        new_join_cond = right_bound_factor.join_cond.union(join_cond[left_table])
        if len(actual_join_cond) != 0:
            for cond in actual_join_cond:
                key1 = cond.split("=")[0].strip()
                key2 = cond.split("=")[1].strip()
                if key1.split(".")[0] == left_table:
                    key_left = tables_all[left_table] + "." + key1.split(".")[-1]
                    key_group = table_key_group_map[left_table][key_left]
                    if key_group not in equivalent_key_group:
                        equivalent_key_group[key_group] = dict()
                    if left_table in equivalent_key_group[key_group]:
                        equivalent_key_group[key_group]["left"].append(key_left)
                    else:
                        equivalent_key_group[key_group]["left"] = [key_left]
                    right_table = key2.split(".")[0]
                    key_right = tables_all[right_table] + "." + key2.split(".")[-1]
                    key_group_t = table_key_group_map[right_table][key_right]
                    assert key_group_t == key_group, f"key group mismatch for join {cond}"
                    if "right" in equivalent_key_group[key_group]:
                        equivalent_key_group[key_group]["right"].append(key_right)
                    else:
                        equivalent_key_group[key_group]["right"] = [key_right]
                else:
                    assert key2.split(".")[0] == left_table, f"unrecognized table alias"
                    key_left = tables_all[left_table] + "." + key2.split(".")[-1]
                    key_group = table_key_group_map[left_table][key_left]
                    if key_group not in equivalent_key_group:
                        equivalent_key_group[key_group] = dict()
                    if left_table in equivalent_key_group[key_group]:
                        equivalent_key_group[key_group]["left"].append(key_left)
                    else:
                        equivalent_key_group[key_group]["left"] = [key_left]
                    right_table = key1.split(".")[0]
                    key_right = tables_all[right_table] + "." + key1.split(".")[-1]
                    key_group_t = table_key_group_map[right_table][key_right]
                    assert key_group_t == key_group, f"key group mismatch for join {cond}"
                    if "right" in equivalent_key_group[key_group]:
                        equivalent_key_group[key_group]["right"].append(key_right)
                    else:
                        equivalent_key_group[key_group]["right"] = [key_right]
            
            for group in union_key_group_set:
                if group in equivalent_key_group:
                    new_left_key = []
                    for key in table_key_equivalent_group[left_table][group]:
                        if key not in equivalent_key_group[group]["left"]:
                            new_left_key.append(key)
                    if len(new_left_key) != 0:
                        union_key_group[group] = [("left", new_left_key)]
                    new_right_key = []
                    for key in right_bound_factor.table_key_equivalent_group[group]:
                        if key not in equivalent_key_group[group]["right"]:
                            new_right_key.append(key)
                    if len(new_right_key) != 0:
                        if group in union_key_group:
                            union_key_group[group].append(("right", new_right_key))
                        else:
                            union_key_group[group] = [("right", new_right_key)]
                else:
                    if group in table_key_equivalent_group[left_table]:
                        if group in union_key_group:
                            union_key_group[group].append(("left", table_key_equivalent_group[left_table][group]))
                        else:
                            union_key_group[group] = [("left", table_key_equivalent_group[left_table][group])]
                    if group in right_bound_factor.table_key_equivalent_group:
                        if group in union_key_group:
                            union_key_group[group].append(("right", right_bound_factor.table_key_equivalent_group[group]))
                        else:
                            union_key_group[group] = [("right", right_bound_factor.table_key_equivalent_group[group])]
                        
        else:
            common_key_group = table_equivalent_group[left_table].intersection(right_bound_factor.equivalent_groups)
            common_key_group = list(common_key_group)[0]
            for group in union_key_group_set:
                if group == common_key_group:
                    equivalent_key_group[group] = dict()
                    equivalent_key_group[group]["left"] = table_key_equivalent_group[left_table][group]
                    equivalent_key_group[group]["right"] = right_bound_factor.table_key_equivalent_group[group]
                else:
                    if group in table_key_equivalent_group[left_table]:
                        if group in union_key_group:
                            union_key_group[group].append(("left", table_key_equivalent_group[left_table][group]))
                        else:
                            union_key_group[group] = [("left", table_key_equivalent_group[left_table][group])]
                    if group in right_bound_factor.table_key_equivalent_group:
                        if group in union_key_group:
                            union_key_group[group].append(("right", right_bound_factor.table_key_equivalent_group[group]))
                        else:
                            union_key_group[group] = [("right", right_bound_factor.table_key_equivalent_group[group])]

        return equivalent_key_group, union_key_group_set, union_key_group, new_join_cond

    def get_additional_join_with_table_group(self, left_table, right_union_key_group, table_equivalent_group,
                                             table_key_equivalent_group):
        common_key_group = table_equivalent_group[left_table].intersection(set(right_union_key_group.keys()))
        union_key_group_set = table_equivalent_group[left_table].union(set(right_union_key_group.keys()))
        union_key_group = copy.deepcopy(right_union_key_group)
        all_join_predicates = []
        for group in union_key_group_set:
            if group in common_key_group:
                left_key = table_key_equivalent_group[left_table][group][0]
                left_key = left_table + "." + left_key.split(".")[-1]
                right_key = right_union_key_group[group]
                join_predicate = left_key + " = " + right_key
                all_join_predicates.append(join_predicate)
            if group not in union_key_group:
                assert group in table_key_equivalent_group[left_table]
                left_key = table_key_equivalent_group[left_table][group][0]
                left_key = left_table + "." + left_key.split(".")[-1]
                union_key_group[group] = left_key

        return all_join_predicates, union_key_group

    def join_two_tables(self, sub_plan_query_str, left_table, right_table, tables_all, conditional_factors, join_keys,
                        table_equivalent_group, table_key_equivalent_group, table_key_group_map, join_cond):
        """
            Get the cardinality bound by joining the left_table with the right_table
            :param left_table:
            :param right_table:
        """
        equivalent_key_group, union_key_group_set, union_key_group, new_join_cond = \
            self.get_join_keys_two_tables(left_table, right_table, table_equivalent_group, table_key_equivalent_group,
                                          table_key_group_map, join_cond, join_keys, tables_all)
        # print(left_table, right_table)
        # print(equivalent_key_group)
        # print(union_key_group)
        # print(conditional_factors.keys())
        cond_factor_left = conditional_factors[left_table]
        cond_factor_right = conditional_factors[right_table]
        bin_mode_left = self.table_buckets[tables_all[left_table]].oned_bin_modes
        bin_mode_right = self.table_buckets[tables_all[right_table]].oned_bin_modes
        key_group_pdf = dict()
        key_group_bin_mode = dict()
        new_union_key_group = dict()
        res = cond_factor_right.table_len
        new_na_values = dict()
        new_variables = dict()
        # print(equivalent_key_group)
        for key_group in equivalent_key_group:
            # print(key_group)
            # print("========================")
            #print(equivalent_key_group[key_group][left_table])
            #print(bin_mode_left)
            # print("==========================================")
            #print(equivalent_key_group[key_group][right_table])
            #print(bin_mode_right)
            if len(equivalent_key_group[key_group][left_table]) > 1:
                print(len(equivalent_key_group[key_group][left_table]), sub_plan_query_str)
            if len(equivalent_key_group[key_group][right_table]) > 1:
                print(len(equivalent_key_group[key_group][right_table]), sub_plan_query_str)
            all_pdfs = [cond_factor_left.pdfs[key] * cond_factor_left.table_len * cond_factor_left.na_values[key]
                        for key in equivalent_key_group[key_group][left_table]] + \
                       [cond_factor_right.pdfs[key] * res * cond_factor_right.na_values[key]
                        for key in equivalent_key_group[key_group][right_table]]
            all_bin_modes = [bin_mode_left[key] for key in equivalent_key_group[key_group][left_table]] + \
                            [bin_mode_right[key] for key in equivalent_key_group[key_group][right_table]]
            # print("====================================================")
            # print(equivalent_key_group[key_group][left_table])
            # print(equivalent_key_group[key_group][right_table])
            for key in equivalent_key_group[key_group][left_table] + equivalent_key_group[key_group][right_table]:
                new_variables[key] = key_group
            new_pdf, new_bin_mode = self.compute_bound_oned(all_pdfs, all_bin_modes, return_factor=True)
            res = np.sum(new_pdf)
            key_group_pdf[key_group] = new_pdf / res
            key_group_bin_mode[key_group] = new_bin_mode
            new_union_key_group[key_group] = [key_group]
            new_na_values[key_group] = 1.0

        for group in union_key_group:
            if group not in new_union_key_group:
                new_union_key_group[group] = []
            for table, keys in union_key_group[group]:
                for key in keys:
                    new_union_key_group[group].append(key)
                    key_group_pdf[key] = conditional_factors[table].pdfs[key]
                    key_group_bin_mode[key] = self.table_buckets[tables_all[table]].oned_bin_modes[key]
                    new_na_values[key] = conditional_factors[table].na_values[key]
        
        #print("!!!!!!!", union_key_group)
        #print(new_union_key_group)
        new_factor = Group_Factor(sub_plan_query_str, res, new_variables, key_group_pdf, key_group_bin_mode,
                                  union_key_group_set, new_union_key_group, new_na_values, new_join_cond)
        return new_factor, res

    def get_join_keys_two_tables(self, left_table, right_table, table_equivalent_group, table_key_equivalent_group,
                                 table_key_group_map, join_cond, join_keys, tables_all):
        """
            Get the join keys between two tables
        """
        actual_join_cond = []
        for cond in join_cond[left_table]:
            if cond in join_cond[right_table]:
                actual_join_cond.append(cond)
        equivalent_key_group = dict()
        union_key_group_set = table_equivalent_group[left_table].union(table_equivalent_group[right_table])
        union_key_group = dict()
        new_join_cond = join_cond[left_table].union(join_cond[right_table])
        if len(actual_join_cond) != 0:
            for cond in actual_join_cond:
                key1 = cond.split("=")[0].strip()
                key2 = cond.split("=")[1].strip()
                if key1.split(".")[0] == left_table:
                    key_left = tables_all[left_table] + "." + key1.split(".")[-1]
                    key_group = table_key_group_map[left_table][key_left]
                    if key_group not in equivalent_key_group:
                        equivalent_key_group[key_group] = dict()
                    if left_table in equivalent_key_group[key_group]:
                        equivalent_key_group[key_group][left_table].append(key_left)
                    else:
                        equivalent_key_group[key_group][left_table] = [key_left]
                    assert key2.split(".")[0] == right_table, f"unrecognized table alias"
                    key_right = tables_all[right_table] + "." + key2.split(".")[-1]
                    key_group_t = table_key_group_map[right_table][key_right]
                    assert key_group_t == key_group, f"key group mismatch for join {cond}"
                    if right_table in equivalent_key_group[key_group]:
                        equivalent_key_group[key_group][right_table].append(key_right)
                    else:
                        equivalent_key_group[key_group][right_table] = [key_right]
                else:
                    assert key2.split(".")[0] == left_table, f"unrecognized table alias"
                    key_left = tables_all[left_table] + "." + key2.split(".")[-1]
                    key_group = table_key_group_map[left_table][key_left]
                    if key_group not in equivalent_key_group:
                        equivalent_key_group[key_group] = dict()
                    if left_table in equivalent_key_group[key_group]:
                        equivalent_key_group[key_group][left_table].append(key_left)
                    else:
                        equivalent_key_group[key_group][left_table] = [key_left]
                    assert key1.split(".")[0] == right_table, f"unrecognized table alias"
                    key_right = tables_all[right_table] + "." + key1.split(".")[-1]
                    key_group_t = table_key_group_map[right_table][key_right]
                    assert key_group_t == key_group, f"key group mismatch for join {cond}"
                    if right_table in equivalent_key_group[key_group]:
                        equivalent_key_group[key_group][right_table].append(key_right)
                    else:
                        equivalent_key_group[key_group][right_table] = [key_right]

            for group in union_key_group_set:
                if group in equivalent_key_group:
                    new_left_key = []
                    for key in table_key_equivalent_group[left_table][group]:
                        if key not in equivalent_key_group[group][left_table]:
                            new_left_key.append(key)
                    if len(new_left_key) != 0:
                        union_key_group[group] = [(left_table, new_left_key)]
                    new_right_key = []
                    for key in table_key_equivalent_group[right_table][group]:
                        if key not in equivalent_key_group[group][right_table]:
                            new_right_key.append(key)
                    if len(new_right_key) != 0:
                        if group in union_key_group:
                            union_key_group[group].append((right_table, new_right_key))
                        else:
                            union_key_group[group] = [(right_table, new_right_key)]
                else:
                    if group in table_key_equivalent_group[left_table]:
                        if group in union_key_group:
                            union_key_group[group].append((left_table, table_key_equivalent_group[left_table][group]))
                        else:
                            union_key_group[group] = [(left_table, table_key_equivalent_group[left_table][group])]
                    if group in table_key_equivalent_group[right_table]:
                        if group in union_key_group:
                            union_key_group[group].append((right_table, table_key_equivalent_group[right_table][group]))
                        else:
                            union_key_group[group] = [(right_table, table_key_equivalent_group[right_table][group])]

        else:
            common_key_group = table_equivalent_group[left_table].intersection(table_equivalent_group[right_table])
            common_key_group = list(common_key_group)[0]
            for group in union_key_group_set:
                if group == common_key_group:
                    equivalent_key_group[group] = dict()
                    equivalent_key_group[group][left_table] = table_key_equivalent_group[left_table][group]
                    equivalent_key_group[group][right_table] = table_key_equivalent_group[right_table][group]
                elif group in table_key_equivalent_group[left_table]:
                    if group in union_key_group:
                        union_key_group[group].append((left_table, table_key_equivalent_group[left_table][group]))
                    else:
                        union_key_group[group] = [(left_table, table_key_equivalent_group[left_table][group])]
                else:
                    if group in union_key_group:
                        union_key_group[group].append((right_table, table_key_equivalent_group[right_table][group]))
                    else:
                        union_key_group[group] = [(right_table, table_key_equivalent_group[right_table][group])]

        return equivalent_key_group, union_key_group_set, union_key_group, new_join_cond

    def get_additional_joins_two_tables(self, left_table, right_table, table_equivalent_group,
                                        table_key_equivalent_group):
        common_key_group = table_equivalent_group[left_table].intersection(table_equivalent_group[right_table])
        union_key_group_set = table_equivalent_group[left_table].union(table_equivalent_group[right_table])
        union_key_group = dict()
        all_join_predicates = []
        for group in union_key_group_set:
            if group in common_key_group:
                left_key = table_key_equivalent_group[left_table][group][0]
                left_key = left_table + "." + left_key.split(".")[-1]
                right_key = table_key_equivalent_group[right_table][group][0]
                right_key = right_table + "." + right_key.split(".")[-1]
                join_predicate = left_key + " = " + right_key
                all_join_predicates.append(join_predicate)
            if group in table_key_equivalent_group[left_table]:
                left_key = table_key_equivalent_group[left_table][group][0]
                left_key = left_table + "." + left_key.split(".")[-1]
                union_key_group[group] = left_key
            else:
                right_key = table_key_equivalent_group[right_table][group][0]
                right_key = right_table + "." + right_key.split(".")[-1]
                union_key_group[group] = right_key

        return all_join_predicates, union_key_group

    def get_sub_plan_join_key(self, sub_plan_query, join_cond):
        # returning a subset of join_keys covered by the tables in sub_plan_query
        touched_join_cond = set()
        untouched_join_cond = set()
        for tab in join_cond:
            if tab in sub_plan_query:
                touched_join_cond = touched_join_cond.union(join_cond[tab])
            else:
                untouched_join_cond = untouched_join_cond.union(join_cond[tab])
        touched_join_cond -= untouched_join_cond

        join_keys = dict()
        for cond in touched_join_cond:
            key1 = cond.split("=")[0].strip()
            table1 = key1.split(".")[0].strip()
            if table1 not in join_keys:
                join_keys[table1] = set([key1])
            else:
                join_keys[table1].add(key1)

            key2 = cond.split("=")[1].strip()
            table2 = key2.split(".")[0].strip()
            if table2 not in join_keys:
                join_keys[table2] = set([key2])
            else:
                join_keys[table2].add(key2)

        return join_keys

In [32]:
be = Bound_ensemble(table_buckets, schema, ground_truth_factors_no_filter)
model_path = model_folder + f"model_imdb_default.pkl"
pickle.dump(be, open(model_path, 'wb'), pickle.HIGHEST_PROTOCOL)
print(f"models save at {model_path}")

models save at /home/ubuntu/data_CE/CE_scheme_models/model_imdb_default.pkl


In [11]:
query_path = "/home/ubuntu/data_CE/job/"
queries = []
q_file_names = []
for query_no in range(1, 34):
    for suffix in ['a', 'b', 'c', 'd', 'e', 'f', 'g']:
        file = f"{query_no}{suffix}.sql"
        if file in os.listdir(query_path):
            q_file_names.append(file.split(".sql")[0])
            with open(query_path+file, "r") as f:
                q = f.readline()
                queries.append(q)

In [12]:
def identify_key_values(schema):
    """
    identify all the key attributes from the schema of a DB, currently we assume all possible joins are known
    It is also easy to support unseen joins, which we left as a future work.
    :param schema: the schema of a DB
    :return: a dict of all keys, {table: [keys]};
             a dict of set, each indicating which keys on different tables are considered the same key.
    """
    all_keys = set()
    equivalent_keys = dict()
    for i, join in enumerate(schema.relationships):
        keys = join.identifier.split(" = ")
        all_keys.add(keys[0])
        all_keys.add(keys[1])
        seen = False
        for k in equivalent_keys:
            if keys[0] in equivalent_keys[k]:
                equivalent_keys[k].add(keys[1])
                seen = True
                break
            elif keys[1] in equivalent_keys[k]:
                equivalent_keys[k].add(keys[0])
                seen = True
                break
        if not seen:
            # set the keys[-1] as the identifier of this equivalent join key group for convenience.
            equivalent_keys[keys[-1]] = set(keys)

    assert len(all_keys) == sum([len(equivalent_keys[k]) for k in equivalent_keys])
    return all_keys, equivalent_keys

In [13]:
all_keys, equivalent_keys = identify_key_values(schema)
print(equivalent_keys)
print(len(equivalent_keys))
print(len(all_keys))

{'kind_type.id': {'aka_title.kind_id', 'kind_type.id', 'title.kind_id'}, 'info_type.id': {'movie_info_idx.info_type_id', 'info_type.id', 'movie_info.info_type_id', 'person_info.info_type_id'}, 'title.id': {'title.id', 'movie_keyword.movie_id', 'movie_companies.movie_id', 'cast_info.movie_id', 'complete_cast.movie_id', 'aka_title.movie_id', 'movie_link.movie_id', 'movie_info.movie_id', 'movie_info_idx.movie_id', 'movie_link.linked_movie_id'}, 'name.id': {'name.id', 'aka_name.person_id', 'cast_info.person_id', 'person_info.person_id'}, 'char_name.id': {'cast_info.person_role_id', 'char_name.id'}, 'role_type.id': {'cast_info.role_id', 'role_type.id'}, 'comp_cast_type.id': {'comp_cast_type.id', 'complete_cast.subject_id', 'complete_cast.status_id'}, 'link_type.id': {'link_type.id', 'movie_link.link_type_id'}, 'keyword.id': {'keyword.id', 'movie_keyword.keyword_id'}, 'company_name.id': {'company_name.id', 'movie_companies.company_id'}, 'company_type.id': {'company_type.id', 'movie_companies

In [14]:
def count_join_key_appearance(queries, equivalent_keys):
    """
    analyze the workload and count how many times each join key group appears
    """
    all_join_keys_stats = dict()
    total_num_appearance = 0
    for q in queries:
        res = parse_query_all_join(q)
        for table in res[-1]:
            for join_key in list(res[-1][table]):
                for PK in equivalent_keys:
                    if join_key in equivalent_keys[PK]:
                        total_num_appearance += 1
                        if PK in all_join_keys_stats:
                            all_join_keys_stats[PK] += 1
                        else:
                            all_join_keys_stats[PK] = 1
                        break
    return all_join_keys_stats, total_num_appearance
                

In [15]:
all_join_keys_stats, total_num_appearance = count_join_key_appearance(queries, equivalent_keys)

{'company_type.id': 82,
 'title.id': 481,
 'info_type.id': 198,
 'company_name.id': 148,
 'keyword.id': 150,
 'name.id': 138,
 'link_type.id': 36,
 'role_type.id': 40,
 'char_name.id': 44,
 'kind_type.id': 58,
 'comp_cast_type.id': 78}

In [16]:
n_bins = {
    'title.id': 800,
    'info_type.id': 100,
    'keyword.id': 100,
    'company_name.id': 100,
    'name.id': 100,
    'company_type.id': 100,
    'comp_cast_type.id': 50,
    'kind_type.id': 50,
    'char_name.id': 50,
    'role_type.id': 50,
    'link_type.id': 50
}

In [17]:
def make_sample(np_data, nrows=1000000, seed=0):
    np.random.seed(seed)
    samp_data = np_data[np_data != -1]
    if len(samp_data) <= nrows:
        return samp_data, 1.0
    else:
        selected = np.random.choice(len(samp_data), size=nrows, replace=False)
        return samp_data[selected], nrows/len(samp_data)

def stats_analysis(sample, data, sample_rate, show=10):
    n, c = np.unique(sample, return_counts=True)
    idx = np.argsort(c)[::-1]
    for i in range(min(show, len(idx))):
        print(c[idx[i]], c[idx[i]]/sample_rate, len(data[data == n[idx[i]]]))

In [18]:
data = dict()
table_len = dict()
na_values = dict()
sample_rate = dict()
primary_keys = []
for table_obj in schema.tables:
    df_rows = pd.read_csv(table_obj.csv_file_location, header=None, escapechar='\\', encoding='utf-8', quotechar='"',
                          sep=",")
    
    df_rows.columns = [table_obj.table_name + '.' + attr for attr in table_obj.attributes]

    for attribute in table_obj.irrelevant_attributes:
        df_rows = df_rows.drop(table_obj.table_name + '.' + attribute, axis=1)

    df_rows.apply(pd.to_numeric, errors="ignore")
    table_len[table_obj.table_name] = len(df_rows)
    if table_obj.table_name not in na_values:
        na_values[table_obj.table_name] = dict()
    for attr in df_rows.columns:
        if attr in all_keys:
            print(attr)
            print(np.sum(np.isnan(df_rows[attr].values)))
            data[attr] = df_rows[attr].values
            data[attr][np.isnan(data[attr])] = -1
            data[attr][data[attr] < 0] = -1
            na_values[table_obj.table_name][attr] = len(data[attr][data[attr] != -1])/table_len[table_obj.table_name]
            print(len(data[attr]), na_values[table_obj.table_name][attr])
            data[attr] = copy.deepcopy(data[attr])[data[attr]>=0]
            if len(np.unique(data[attr])) >= len(data[attr]) - 10:
                primary_keys.append(attr)
            print(np.sum(np.isnan(df_rows[attr].values)))

title.id
0
2528312 1.0
0
title.kind_id
0
2528312 1.0
0
movie_info_idx.movie_id
0
1380035 1.0
0
movie_info_idx.info_type_id
0
1380035 1.0
0
movie_info.movie_id
0
14835720 1.0
0
movie_info.info_type_id
0
14835720 1.0
0
info_type.id
0
113 1.0
0
cast_info.person_id
0
36244344 1.0
0
cast_info.movie_id
0
36244344 1.0
0
cast_info.person_role_id
18672825
36244344 0.48480720191818066
0
cast_info.role_id
0
36244344 1.0
0
char_name.id
0
3140339 1.0
0
role_type.id
0
12 1.0
0
complete_cast.movie_id
0
135086 1.0
0
complete_cast.subject_id
0
135086 1.0
0
complete_cast.status_id
0
135086 1.0
0
comp_cast_type.id
0
4 1.0
0
name.id
0
4167491 1.0
0
aka_name.person_id
0
901343 1.0
0
movie_link.movie_id
0
29997 1.0
0
movie_link.linked_movie_id
0
29997 1.0
0
movie_link.link_type_id
0
29997 1.0
0
link_type.id
0
18 1.0
0
movie_keyword.movie_id
0
4523930 1.0
0
movie_keyword.keyword_id
0
4523930 1.0
0
keyword.id
0
134170 1.0
0
person_info.person_id
0
2963664 1.0
0
person_info.info_type_id
0
2963664 1.0
0
movie_c

In [19]:
sample_rate = dict()
sampled_data = dict()
for k in data:
    print(k)
    temp = make_sample(data[k], 1000000)
    stats_analysis(temp[0], data[k], temp[1])
    sampled_data[k] = temp[0]
    sample_rate[k] = temp[1]


title.id
1 2.528312 1
1 2.528312 1
1 2.528312 1
1 2.528312 1
1 2.528312 1
1 2.528312 1
1 2.528312 1
1 2.528312 1
1 2.528312 1
1 2.528312 1
title.kind_id
609674 1541446.090288 1543264
262591 663911.976392 662824
46890 118552.54968 118234
39888 100849.309056 100537
35940 90867.53328 90852
5016 12682.012992 12600
1 2.528312 1
movie_info_idx.movie_id
4 5.52014 4
4 5.52014 4
4 5.52014 4
4 5.52014 4
4 5.52014 4
4 5.52014 4
4 5.52014 4
4 5.52014 4
4 5.52014 4
4 5.52014 4
movie_info_idx.info_type_id
333438 460156.11033 459925
333295 459958.765325 459925
333076 459656.53766 459925
185 255.30647499999998 250
6 8.28021 10
movie_info.movie_id
194 2878.12968 2937
118 1750.61496 1584
109 1617.09348 1638
97 1439.06484 1344
92 1364.88624 1281
89 1320.37908 1081
83 1231.36476 1219
82 1216.5290400000001 1160
77 1142.35044 1189
77 1142.35044 1207
movie_info.info_type_id
204500 3033904.74 3036719
103748 1539176.27856 1533909
94821 1406737.80612 1401902
89252 1324117.68144 1325361
87478 1297799.11416 12989

In [20]:
import numpy as np
import copy


class Bucket:
    """
    The class of bucketization of a key attribute
    """

    def __init__(self, name, bins=[], bin_modes=[], bin_vars=[], bin_means=[], rest_bins_remaining=None):
        self.name = name
        self.bins = bins
        self.bin_modes = bin_modes
        self.bin_vars = bin_vars
        self.bin_means = bin_means
        self.rest_bins_remaining = rest_bins_remaining
        if len(bins) != 0:
            assert len(bins) == len(bin_modes)


class Table_bucket:
    """
    The class of bucketization for all key attributes in a table.
    Supporting more than three dimensional bin modes requires simplifying the causal structure
    """
    def __init__(self, table_name, id_attributes, bin_sizes, oned_bin_modes=None):
        self.table_name = table_name
        self.id_attributes = id_attributes
        self.bin_sizes = bin_sizes
        if oned_bin_modes:
            self.oned_bin_modes = oned_bin_modes
        else:
            self.oned_bin_modes = dict()
        self.twod_bin_modes = dict()


class Bucket_group:
    """
    The class of bucketization for a group of equivalent join keys
    """

    def __init__(self, buckets, start_key, sample_rate, bins=None, primary_keys=[]):
        self.buckets = buckets
        self.start_key = start_key
        self.sample_rate = sample_rate
        self.bins = bins
        self.primary_keys = primary_keys

    def bucketize(self, data):
        """
        Discretize data based on the bucket
        """
        res = dict()
        seen_remain_key = np.array([])
        cumulative_bin = copy.deepcopy(self.buckets[self.start_key].bins)
        start_means = np.asarray(self.buckets[self.start_key].bin_means)

        for key in data:
            if key in self.primary_keys:
                continue
            res[key] = copy.deepcopy(data[key])
            if key != self.start_key:
                unique_remain = np.setdiff1d(self.buckets[key].rest_bins_remaining, seen_remain_key)
                assert sum([np.sum(np.isin(unique_remain, b) == 1) for b in cumulative_bin]) == 0

                if len(unique_remain) != 0:
                    remaining_data = data[key][np.isin(data[key], unique_remain)]
                    unique_remain, count_remain = np.unique(remaining_data, return_counts=True)
                    unique_counts = np.unique(count_remain)
                    for u in unique_counts:
                        temp_idx = np.searchsorted(start_means, u)
                        if temp_idx == len(cumulative_bin):
                            idx = -1
                            if u > self.buckets[key].bin_modes[-1]:
                                self.buckets[key].bin_modes[-1] = u
                        elif temp_idx == 0:
                            idx = 0
                        else:
                            if (u - start_means[temp_idx - 1]) >= (start_means[temp_idx] - u):
                                idx = temp_idx - 1
                            else:
                                idx = temp_idx
                        temp_unique = unique_remain[count_remain == u]
                        cumulative_bin[idx] = np.concatenate((cumulative_bin[idx], temp_unique))
                        seen_remain_key = np.concatenate((seen_remain_key, temp_unique))
                        if u > self.buckets[key].bin_modes[idx]:
                            self.buckets[key].bin_modes[idx] = u
            res[key] = copy.deepcopy(data[key])
            count = 0
            for i, b in enumerate(cumulative_bin):
                count += len(data[key][np.isin(data[key], b)])
                res[key][np.isin(data[key], b)] = i

        self.bins = cumulative_bin

        for key in data:
            if key in self.primary_keys:
                res[key] = self.bucketize_PK(data[key])
                self.buckets[key] = Bucket(key, bin_modes=np.ones(len(self.bins)))
        
        for key in data:
            if key in self.primary_keys:
                continue
            if self.sample_rate[key] < 1.0:
                bin_modes = np.asarray(self.buckets[key].bin_modes)
                bin_modes[bin_modes != 1] = bin_modes[bin_modes != 1] / self.sample_rate[key]
                self.buckets[key].bin_modes = bin_modes
        
        return res

    def bucketize_PK(self, data):
        res = copy.deepcopy(data)
        remaining_data = np.unique(data)
        for i, b in enumerate(self.bins):
            res[np.isin(data, b)] = i
            remaining_data = np.setdiff1d(remaining_data, b)
        if len(remaining_data) != 0:
            self.bins.append(list(remaining_data))
            for key in self.buckets:
                if key not in self.primary_keys:
                    self.buckets[key].bin_modes = np.append(self.buckets[key].bin_modes, 0)
        res[np.isin(data, remaining_data)] = len(self.bins)
        return res



def identify_key_values(schema):
    """
    identify all the key attributes from the schema of a DB, currently we assume all possible joins are known
    It is also easy to support unseen joins, which we left as a future work.
    :param schema: the schema of a DB
    :return: a dict of all keys, {table: [keys]};
             a dict of set, each indicating which keys on different tables are considered the same key.
    """
    all_keys = set()
    equivalent_keys = dict()
    for i, join in enumerate(schema.relationships):
        keys = join.identifier.split(" = ")
        all_keys.add(keys[0])
        all_keys.add(keys[1])
        seen = False
        for k in equivalent_keys:
            if keys[0] in equivalent_keys[k]:
                equivalent_keys[k].add(keys[1])
                seen = True
                break
            elif keys[1] in equivalent_keys[k]:
                equivalent_keys[k].add(keys[0])
                seen = True
                break
        if not seen:
            # set the keys[-1] as the identifier of this equivalent join key group for convenience.
            equivalent_keys[keys[-1]] = set(keys)

    assert len(all_keys) == sum([len(equivalent_keys[k]) for k in equivalent_keys])
    return all_keys, equivalent_keys


def equal_freq_binning(name, data, n_bins, data_len):
    uniques, counts = data
    if len(uniques) <= n_bins:
        bins = []
        bin_modes = []
        bin_vars = []
        bin_means = []
        
        for i, uni in enumerate(uniques):
            bins.append([uni])
            bin_modes.append(counts[i])
            bin_vars.append(0)
            bin_means.append(counts[i])
        return Bucket(name, bins, bin_modes, bin_vars, bin_means)
    
    unique_counts, count_counts = np.unique(counts, return_counts=True)
    idx = np.argsort(unique_counts)
    unique_counts = unique_counts[idx]
    count_counts = count_counts[idx]

    bins = []
    bin_modes = []
    bin_vars = []
    bin_means = []

    bin_freq = data_len / n_bins
    cur_freq = 0
    cur_bin = []
    cur_bin_count = []
    for i, uni_c in enumerate(unique_counts):
        cur_freq += count_counts[i] * uni_c
        cur_bin.append(uniques[np.where(counts == uni_c)[0]])
        cur_bin_count.extend([uni_c] * count_counts[i])
        if (cur_freq > bin_freq) or (i == (len(unique_counts) - 1)):
            bins.append(np.concatenate(cur_bin))
            cur_bin_count = np.asarray(cur_bin_count)
            bin_modes.append(uni_c)
            bin_means.append(np.mean(cur_bin_count))
            bin_vars.append(np.var(cur_bin_count))
            cur_freq = 0
            cur_bin = []
            cur_bin_count = []
    assert len(uniques) == sum([len(b) for b in bins]), f"some unique values missed or duplicated"
    return Bucket(name, bins, bin_modes, bin_vars, bin_means)


def compute_variance_score(buckets):
    """
    compute the variance of products of random variables
    """
    all_mean = np.asarray([buckets[k].bin_means for k in buckets])
    all_var = np.asarray([buckets[k].bin_vars for k in buckets])
    return np.sum(np.prod(all_var + all_mean ** 2, axis=0) - np.prod(all_mean, axis=0) ** 2)


def sub_optimal_bucketize(data, sample_rate, n_bins=30, primary_keys=[]):
    """
    Perform sub-optimal bucketization on a group of equivalent join keys.
    :param data: a dict of (potentially sampled) table data of the keys
                 the keys of this dict are one group of equivalent join keys
    :param sample_rate: the sampling rate the data, could be all 1 if no sampling is performed
    :param n_bins: how many bins can we allocate
    :param primary_keys: the primary keys in the equivalent group since we don't need to bucketize PK.
    :return: new data, where the keys are bucketized
             the mode of each bucket
    """
    unique_values = dict()
    for key in data:
        if key not in primary_keys:
            unique_values[key] = np.unique(data[key], return_counts=True)

    best_variance_score = np.infty
    best_bin_len = 0
    best_start_key = None
    best_buckets = None
    for start_key in data:
        if start_key in primary_keys:
            continue
        start_bucket = equal_freq_binning(start_key, unique_values[start_key], n_bins, len(data[start_key]))
        rest_buckets = dict()
        for key in data:
            if key == start_key or key in primary_keys:
                continue
            uniques = unique_values[key][0]
            counts = unique_values[key][1]
            rest_buckets[key] = Bucket(key, [], [0] * len(start_bucket.bins), [0] * len(start_bucket.bins),
                                       [0] * len(start_bucket.bins), uniques)
            for i, bin in enumerate(start_bucket.bins):
                idx = np.where(np.isin(uniques, bin) == 1)[0]
                if len(idx) != 0:
                    bin_count = counts[idx]
                    unique_bin_keys = uniques[idx]
                    # unique_bin_count = np.unique(bin_count)
                    # bin_count = np.concatenate([counts[counts == j] for j in unique_bin_count])
                    # unique_bin_keys = np.concatenate([uniques[counts == j] for j in unique_bin_count])
                    rest_buckets[key].rest_bins_remaining = np.setdiff1d(rest_buckets[key].rest_bins_remaining,
                                                                         unique_bin_keys)
                    rest_buckets[key].bin_modes[i] = np.max(bin_count)
                    rest_buckets[key].bin_vars[i] = np.var(bin_count)
                    rest_buckets[key].bin_means[i] = np.mean(bin_count)

        rest_buckets[start_key] = start_bucket
        var_score = compute_variance_score(rest_buckets)
        if len(start_bucket.bins) > best_bin_len:
            best_variance_score = var_score
            best_start_key = start_key
            best_buckets = rest_buckets
            best_bin_len = len(start_bucket.bins)
        elif len(start_bucket.bins) >= best_bin_len * 0.9 and var_score < best_variance_score:
            best_variance_score = var_score
            best_start_key = start_key
            best_buckets = rest_buckets
            best_bin_len = len(start_bucket.bins)
    
    best_buckets = Bucket_group(best_buckets, best_start_key, sample_rate, primary_keys=primary_keys)
    new_data = best_buckets.bucketize(data)
    return new_data, best_buckets


def fixed_start_key_bucketize(start_key, data, sample_rate, n_bins=30, primary_keys=[]):
    """
    Perform sub-optimal bucketization on a group of equivalent join keys based on the pre-defined start_key.
    :param data: a dict of (potentially sampled) table data of the keys
                 the keys of this dict are one group of equivalent join keys
    :param sample_rate: the sampling rate the data, could be all 1 if no sampling is performed
    :param n_bins: how many bins can we allocate
    :param primary_keys: the primary keys in the equivalent group since we don't need to bucketize PK.
    :return: new data, where the keys are bucketized
             the mode of each bucket
    """
    unique_values = dict()
    for key in data:
        if key not in primary_keys:
            unique_values[key] = np.unique(data[key], return_counts=True)

    start_bucket = equal_freq_binning(start_key, unique_values[start_key], n_bins, len(data[start_key]))
    rest_buckets = dict()
    for key in data:
        if key == start_key or key in primary_keys:
            continue
        uniques = unique_values[key][0]
        counts = unique_values[key][1]
        rest_buckets[key] = Bucket(key, [], [0] * len(start_bucket.bins), [0] * len(start_bucket.bins),
                                   [0] * len(start_bucket.bins), uniques)
        for i, bin in enumerate(start_bucket.bins):
            idx = np.where(np.isin(uniques, bin) == 1)[0]
            if len(idx) != 0:
                bin_count = counts[idx]
                unique_bin_keys = uniques[idx]
                rest_buckets[key].rest_bins_remaining = np.setdiff1d(rest_buckets[key].rest_bins_remaining,
                                                                     unique_bin_keys)
                rest_buckets[key].bin_modes[i] = np.max(bin_count)
                rest_buckets[key].bin_means[i] = np.mean(bin_count)

    best_buckets = Bucket_group(rest_buckets, start_key, sample_rate, primary_keys=primary_keys)
    new_data = best_buckets.bucketize(data)
    return new_data, best_buckets



In [21]:
schema = gen_imdb_schema(data_path)
all_keys, equivalent_keys = identify_key_values(schema)
data = dict()
primary_keys = []
for table_obj in schema.tables:
    df_rows = pd.read_csv(table_obj.csv_file_location, header=None, escapechar='\\', encoding='utf-8',
                          quotechar='"',
                          sep=",")

    df_rows.columns = [table_obj.table_name + '.' + attr for attr in table_obj.attributes]

    for attribute in table_obj.irrelevant_attributes:
        df_rows = df_rows.drop(table_obj.table_name + '.' + attribute, axis=1)

    df_rows.apply(pd.to_numeric, errors="ignore")
    for attr in df_rows.columns:
        if attr in all_keys:
            data[attr] = df_rows[attr].values
            data[attr][np.isnan(data[attr])] = -1
            data[attr][data[attr] < 0] = -1
            data[attr] = copy.deepcopy(data[attr])[data[attr] >= 0]
            if len(np.unique(data[attr])) >= len(data[attr]) - 10:
                primary_keys.append(attr)

sample_rate = dict()
sampled_data = dict()
for k in data:
    temp = make_sample(data[k], 1000000)
    sampled_data[k] = temp[0]
    sample_rate[k] = temp[1]

optimal_buckets = dict()
bin_size = dict()
all_bin_modes = dict()
for PK in equivalent_keys:
    # if PK != 'kind_type.id':
    #   continue
    group_data = {}
    group_sample_rate = {}
    for K in equivalent_keys[PK]:
        group_data[K] = sampled_data[K]
        group_sample_rate[K] = sample_rate[K]
    _, optimal_bucket = sub_optimal_bucketize(group_data, group_sample_rate, n_bins=n_bins[PK], primary_keys=primary_keys)
    optimal_buckets[PK] = optimal_bucket
    for K in equivalent_keys[PK]:
        temp_table_name = K.split(".")[0]
        if temp_table_name not in bin_size:
            bin_size[temp_table_name] = dict()
            all_bin_modes[temp_table_name] = dict()
        bin_size[temp_table_name][K] = len(optimal_bucket.bins)
        all_bin_modes[temp_table_name][K] = optimal_bucket.buckets[K].bin_modes

table_buckets = dict()
for table_name in bin_size:
    table_buckets[table_name] = Table_bucket(table_name, list(bin_size[table_name].keys()), bin_size[table_name],
                                             all_bin_modes[table_name])

In [22]:
temp_bins = dict()
for key in optimal_buckets:
    temp_bins[key] = optimal_buckets[key].bins

In [23]:
temp_bins.keys()

dict_keys(['kind_type.id', 'info_type.id', 'title.id', 'name.id', 'char_name.id', 'role_type.id', 'comp_cast_type.id', 'link_type.id', 'keyword.id', 'company_name.id', 'company_type.id'])

In [35]:
with open("table_buckets.pkl", "rb") as f:
    old_table_buckets = pickle.load(f)

In [37]:
new_table_buckets = dict()
for key in table_buckets:
    if key in old_table_buckets:
        new_table_buckets[key] = old_table_buckets[key]
    else:
        new_table_buckets[key] = table_buckets[key]

In [36]:
old_table_buckets

{'title': <__main__.Table_bucket at 0x7fc0923dda50>,
 'aka_title': <__main__.Table_bucket at 0x7fc09d1a4b50>,
 'kind_type': <__main__.Table_bucket at 0x7fc09d1a42d0>,
 'info_type': <__main__.Table_bucket at 0x7fc09d1a4b10>,
 'movie_info': <__main__.Table_bucket at 0x7fc09d1a45d0>,
 'movie_info_idx': <__main__.Table_bucket at 0x7fc09d1a4790>,
 'person_info': <__main__.Table_bucket at 0x7fc09d1a4950>,
 'movie_companies': <__main__.Table_bucket at 0x7fc09d1a4c10>,
 'movie_keyword': <__main__.Table_bucket at 0x7fc09c7fd890>,
 'cast_info': <__main__.Table_bucket at 0x7fc09c7fd050>,
 'complete_cast': <__main__.Table_bucket at 0x7fc09c7fdd50>,
 'name': <__main__.Table_bucket at 0x7fc09c7fd350>,
 'aka_name': <__main__.Table_bucket at 0x7fc09c7fd2d0>,
 'char_name': <__main__.Table_bucket at 0x7fc09c7fd290>,
 'role_type': <__main__.Table_bucket at 0x7fc09c7fdb90>,
 'comp_cast_type': <__main__.Table_bucket at 0x7fc09c7fd0d0>,
 'keyword': <__main__.Table_bucket at 0x7fc09c7fd110>,
 'company_name':

In [59]:
def apply_binning_to_data_value_count(bins, data):
    res = np.zeros(len(bins))
    bin_mode = np.zeros(len(bins))
    unique_remain = np.unique(data)
    for i, bin in enumerate(bins):
        data_bin = data[np.isin(data, bin)]
        res[i] = np.sum(np.isin(data, bin))
        unique_remain = np.setdiff1d(unique_remain, bin)
        _, counts = np.unique(data_bin, return_counts=True)
        if len(counts) == 0:
            bin_mode[i] = 0
        else:
            bin_mode[i] = np.max(counts)
    res[0] += np.sum(np.isin(data, unique_remain))
    return bin_mode

In [65]:
t2 = apply_binning_to_data_value_count(bins['title.id'], data['movie_link.linked_movie_id'])

In [67]:
len(t2)

74

In [68]:
new_table_buckets['movie_link'].oned_bin_modes['movie_link.linked_movie_id'] = t2

In [69]:
new_table_buckets['movie_link'].oned_bin_modes

{'movie_link.linked_movie_id': array([ 30.,  33.,  23.,  26.,  23.,  28.,  28.,  72.,  12.,  30.,  17.,
         19.,  26.,  23.,  16.,  26.,  22.,  39.,  21.,  16.,  15.,  34.,
         57.,  15.,  23.,  18.,  36.,  19.,  12.,  35.,  39.,  13.,  19.,
         18.,  37.,  22.,  14.,  14.,  33.,  27., 131.,   8.,  29.,  64.,
         15.,  23.,   5.,  15.,  25.,   7.,  14.,  29.,   9.,  50.,   4.,
         11.,   6.,   6.,   3.,   5.,  16.,  14.,  11.,  20.,  14.,  15.,
          4.,  27.,  19.,  37.,   6.,  12.,   3.,  19.]),
 'movie_link.movie_id': array([267., 517., 117., 161., 276., 146., 143.,  46., 170.,  50.,  32.,
         44.,  40.,  36.,  11., 569.,  67.,  81.,  11., 196.,   8.,   0.,
          2.,  21.,   4.,   0.,   0.,   1., 101.,   0.,   0.,  42.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,  42.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
 

In [28]:
def apply_binning_to_data_value_count(bins, data):
    res = np.zeros(len(bins))
    unique_remain = np.unique(data)
    for i, bin in enumerate(bins):
        res[i] = np.sum(np.isin(data, bin))
        unique_remain = np.setdiff1d(unique_remain, bin)

    res[0] += np.sum(np.isin(data, unique_remain))
    return res

class Factor:
    """
    This the class defines a multidimensional conditional probability on one table.
    """
    def __init__(self, table, table_len, variables, pdfs, na_values=None):
        self.table = table
        self.table_len = table_len
        self.variables = variables
        self.pdfs = pdfs
        self.na_values = na_values  # this is the percentage of data, which is not nan.

all_factor_pdfs = dict()
for PK in equivalent_keys:
    if PK in bins:
        bin_value = bins[PK]
    else:
        bin_value = temp_bins[PK]
    for key in equivalent_keys[PK]:
        table = key.split(".")[0]
        print(table, PK)
        temp = apply_binning_to_data_value_count(bin_value, data[key])
        print(np.sum(temp))
        print(temp)
        if table not in all_factor_pdfs:
            all_factor_pdfs[table] = dict()
        all_factor_pdfs[table][key] = temp / np.sum(temp)

title kind_type.id
2528312.0
[6.628240e+05 9.085200e+04 1.005370e+05 1.182340e+05 1.260000e+04
 1.543264e+06 1.000000e+00 0.000000e+00]
kind_type kind_type.id
7.0
[1. 1. 1. 1. 1. 1. 0. 1.]
aka_title kind_type.id
361472.0
[293275.  20320.  26939.  13497.   4565.   2876.      0.      0.]
person_info info_type.id
2963664.0
[      0.       0.       0.       0.       0.       0.       0.       0.
       0.       0.       0.       0.   75258.       0.  620526.       0.
   49990.       0.       0.       0.       0.       0.       0.       0.
       0.       0.       0.       0.       0.       0.       0.       0.
       0.       0.       0.       0.       0.       0.       0.       0.
       0.       0.       0.       0.       0.       0.       0.       0.
       0.       0.       0.       0.       0.       0.       0.       0.
       0.       0.       0.       0.       0.       0.       0.   12109.
   40462.   10589.       0.       0.       0.       0. 2154730.       0.]
info_type info_type.

2528312.0
[1.881396e+06 1.312220e+05 3.459500e+04 1.693000e+04 1.052400e+04
 7.475000e+03 5.756000e+03 4.737000e+03 3.935000e+03 3.183000e+03
 2.811000e+03 2.285000e+03 1.802000e+03 1.481000e+03 1.161000e+03
 1.024000e+03 7.940000e+02 7.210000e+02 6.270000e+02 5.510000e+02
 4.690000e+02 4.670000e+02 3.700000e+02 3.530000e+02 3.590000e+02
 3.150000e+02 2.990000e+02 2.560000e+02 2.470000e+02 2.120000e+02
 1.960000e+02 2.020000e+02 1.610000e+02 1.750000e+02 1.570000e+02
 1.390000e+02 1.490000e+02 1.310000e+02 1.190000e+02 9.900000e+01
 9.800000e+01 9.300000e+01 9.400000e+01 8.400000e+01 8.100000e+01
 8.800000e+01 7.400000e+01 5.500000e+01 6.100000e+01 4.900000e+01
 4.900000e+01 7.200000e+01 3.900000e+01 5.800000e+01 3.500000e+01
 5.200000e+01 3.000000e+01 3.500000e+01 2.700000e+01 3.100000e+01
 2.600000e+01 2.500000e+01 3.600000e+01 3.200000e+01 2.600000e+01
 2.500000e+01 2.200000e+01 2.600000e+01 2.300000e+01 1.700000e+01
 1.800000e+01 1.400000e+01 1.200000e+01 4.089200e+05]
movie_keywor

234997.0
[8.0873e+04 2.1082e+04 9.1760e+03 5.3470e+03 3.4680e+03 2.4830e+03
 1.8890e+03 1.4290e+03 1.1650e+03 1.7360e+03 1.3280e+03 9.6800e+02
 7.7600e+02 6.2100e+02 7.0000e+02 6.1300e+02 4.5700e+02 4.0400e+02
 3.3700e+02 2.9200e+02 3.1300e+02 2.6500e+02 2.3000e+02 2.3900e+02
 1.7900e+02 1.8300e+02 1.5400e+02 1.6700e+02 1.5400e+02 1.3500e+02
 1.2300e+02 1.0800e+02 9.4000e+01 9.0000e+01 8.3000e+01 7.5000e+01
 7.0000e+01 6.3000e+01 6.0000e+01 5.3000e+01 5.1000e+01 4.7000e+01
 4.1000e+01 3.9000e+01 3.6000e+01 3.2000e+01 2.9000e+01 2.8000e+01
 2.5000e+01 2.3000e+01 2.0000e+01 1.8000e+01 1.7000e+01 1.5000e+01
 1.4000e+01 1.3000e+01 1.2000e+01 1.1000e+01 9.0000e+00 9.0000e+00
 8.0000e+00 7.0000e+00 7.0000e+00 6.0000e+00 5.0000e+00 5.0000e+00
 4.0000e+00 3.0000e+00 3.0000e+00 3.0000e+00 1.0000e+00 1.0000e+00
 1.0000e+00 1.0000e+00 9.6471e+04]
company_type company_type.id
4.0
[1. 1. 2.]
movie_companies company_type.id
2609129.0
[1274246. 1334883.       0.]


In [30]:
all_factors = dict()
for table in all_factor_pdfs:
    all_factors[table] = Factor(table, table_len[table], list(all_factor_pdfs[table].keys()),
                                all_factor_pdfs[table], na_values[table])

In [32]:
all_factor_pdfs.keys()

dict_keys(['title', 'kind_type', 'aka_title', 'person_info', 'info_type', 'movie_info_idx', 'movie_info', 'movie_companies', 'cast_info', 'movie_link', 'movie_keyword', 'complete_cast', 'name', 'aka_name', 'char_name', 'role_type', 'comp_cast_type', 'link_type', 'keyword', 'company_name', 'company_type'])

In [70]:
import pickle
with open("new_table_buckets.pkl", "wb") as f:
    pickle.dump(new_table_buckets, f, pickle.HIGHEST_PROTOCOL)
#with open("ground_truth_factors_no_filter.pkl", "wb") as f:
 #   pickle.dump(all_factors, f, pickle.HIGHEST_PROTOCOL)

In [22]:
import pickle
with open("/home/ubuntu/data_CE/saved_models/bins.pkl", "rb") as f:
    bins = pickle.load(f)
#with open("equivalent_keys.pkl", "rb") as f:
 #   equivalent_keys = pickle.load(f)

In [23]:
bins.keys()

dict_keys(['kind_type.id', 'info_type.id', 'title.id', 'name.id', 'char_name.id', 'role_type.id', 'comp_cast_type.id', 'keyword.id', 'company_name.id', 'company_type.id'])

In [24]:
for key in temp_bins:
    print(key, len(temp_bins[key]), len(bins[key]))
    for i in range(len(temp_bins[key])):
        #print(key, len(temp_bins[key][i]), len(bins[key][i]))
        if np.all(temp_bins[key][i] == bins[key][i]):
            print(key, len(temp_bins[key][i]), len(bins[key][i]), "okay")
        else:
            print(key, len(temp_bins[key][i]), len(bins[key][i]), "notokay")
        #assert np.all(temp_bins[key][i] == bins[key][i])

kind_type.id 8 8
kind_type.id 1 1 okay
kind_type.id 1 1 okay
kind_type.id 1 1 okay
kind_type.id 1 1 okay
kind_type.id 1 1 okay
kind_type.id 1 1 okay
kind_type.id 1 1 okay
kind_type.id 1 1 okay
info_type.id 72 72
info_type.id 3 3 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 okay
info_type.id 1 1 o

  """


IndexError: list index out of range

In [None]:
equivalent_keys

In [None]:
[len(temp_bins[k]) for k in temp_bins]

In [None]:
bucket = temp['company_type.id']
for k in bucket.buckets:
    print("==============================================================")
    print(k, len(bucket.buckets[k].bin_modes), len(bucket.bins))
    print(bucket.buckets[k].bin_modes)
    print([len(b) for b in bucket.buckets[k].bins])
    print(bucket.buckets[k].bin_means)
    print(bucket.buckets[k].bin_vars)

In [None]:
data_path = "/Users/ziniuw/Desktop/past_research/End-to-End-CardEst-Benchmark/datasets/stats_simplified/{}.csv"
model_folder = "../../CE_scheme_models"
data, null_values, key_attrs, table_buckets, equivalent_keys, schema, bin_size = process_stats_data(data_path,
                                                                       model_folder, 200, False)

In [None]:
table = "votes"
bucket = table_buckets[table]
for attr in bucket.id_attributes:
    print(attr)
    print(np.sum(bucket.oned_bin_modes[attr]), bucket.oned_bin_modes[attr].shape)
    if len(bucket.twod_bin_modes[attr]) != 0:
        print(np.sum(bucket.twod_bin_modes[attr]), bucket.twod_bin_modes[attr].shape)

In [None]:
table = "postLinks"
bucket = table_buckets[table]
for attr in bucket.id_attributes:
    print(attr)
    print(np.sum(bucket.oned_bin_modes[attr]), bucket.oned_bin_modes[attr].shape)
    if len(bucket.twod_bin_modes[attr]) != 0:
        print(np.sum(bucket.twod_bin_modes[attr]), bucket.twod_bin_modes[attr].shape)

In [None]:
print(bucket.oned_bin_modes[bucket.id_attributes[0]])
print(np.sum(bucket.twod_bin_modes[bucket.id_attributes[0]], axis=1))
print(bucket.oned_bin_modes[bucket.id_attributes[1]])
print(np.sum(bucket.twod_bin_modes[bucket.id_attributes[1]], axis=0))

In [6]:
a = np.arange(15)

In [2]:
import jenkspy

In [4]:
breaks = jenkspy.jenks_breaks(np.random.randint(0, 10, 200), nb_class=5)
print(breaks)

[0.0, 1.0, 3.0, 5.0, 7.0, 9.0]


In [12]:
from scipy import stats

In [15]:
stats.mode(a).count[0]

1