In [1]:
import pickle
import sys
sys.path.append("../")
from Join_scheme.data_prepare import process_stats_data
from BayesCard.Models.Bayescard_BN import Bayescard_BN
import time
import pandas as pd
import numpy as np
from BayesCard.Evaluation.cardinality_estimation import parse_query_single_table

In [2]:
from Join_scheme.data_prepare import process_stats_data
# Input CSV template (formatted with a table name by the loader) and the folder
# handed to the loader as its second argument.
# NOTE(review): both are absolute, machine-specific paths.
data_path = "/home/ubuntu/End-to-End-CardEst-Benchmark/datasets/stats_simplified/{}.csv"
model_folder = "/home/ubuntu/data_CE/saved_models"

# The positional arguments 200 and "sub_optimal" are presumably the bucket count
# and the bucketization strategy -- confirm against process_stats_data.
(
    data,
    null_values,
    key_attrs,
    table_buckets,
    equivalent_keys,
    schema,
    bin_size,
    all_bin_means,
    all_bin_width,
) = process_stats_data(data_path, model_folder, 200, "sub_optimal", return_bin_means=True)

bucketizing equivalent key group: {'comments.PostId', 'postLinks.PostId', 'postLinks.RelatedPostId', 'votes.PostId', 'postHistory.PostId', 'posts.Id', 'tags.ExcerptPostId'}
bucketizing equivalent key group: {'posts.OwnerUserId', 'postHistory.UserId', 'votes.UserId', 'users.Id', 'badges.UserId', 'comments.UserId'}


In [3]:
# Replace the learned bucket statistics of three id columns with a flat layout:
# every bucket gets a mean of 1 and an equal share (row count / bucket count) of
# the column's rows.
# NOTE(review): the bucket counts (48, 48, 107) are hard-coded; learn_histogram
# asserts that they equal the per-key entries of `bin_size`.
for table_name, key_name, n_buckets in (
    ("tags", "tags.ExcerptPostId", 48),
    ("posts", "posts.Id", 48),
    ("users", "users.Id", 107),
):
    all_bin_means[key_name] = np.ones(n_buckets)
    all_bin_width[key_name] = np.ones(n_buckets) * len(data[table_name][key_name]) / n_buckets


In [4]:
def learn_histogram(data, key_attrs, all_bin_means, all_bin_width, all_bin_size, bin_size=50):
    """
    Build a normalised equal-width histogram for every non-key attribute.

    Key attributes get no histogram; for them the function only checks that the
    recorded bucket count matches the length of the bucket means and widths.

    Args:
        data: mapping table name -> table; iterating a table yields attribute
            names and ``data[table][attr].values`` yields the column values.
        key_attrs: mapping table name -> container of that table's key attributes.
        all_bin_means: mapping key attribute -> per-bucket means.
        all_bin_width: mapping key attribute -> per-bucket widths.
        all_bin_size: mapping table name -> key attribute -> bucket count.
        bin_size: number of bins passed to ``np.histogram`` for non-key attributes.

    Returns:
        ``(all_histogram, all_boundary)``: two mappings table -> attribute ->
        array, holding the bin frequencies (summing to 1) and the bin edges.

    Raises:
        AssertionError: if the bucket bookkeeping of a key attribute is inconsistent.
    """
    all_histogram = {}
    all_boundary = {}
    for table_name, table in data.items():
        table_hist = all_histogram[table_name] = {}
        table_edges = all_boundary[table_name] = {}
        for column in table:
            if column in key_attrs[table_name]:
                # Keys are described by their buckets, not by a value histogram.
                assert all_bin_size[table_name][column] == len(all_bin_means[column]) == len(all_bin_width[column])
                continue
            counts, edges = np.histogram(table[column].values, bins=bin_size)
            table_hist[column] = counts / np.sum(counts)
            table_edges[column] = edges
    return all_histogram, all_boundary

In [5]:
# Note the naming: the `bin_size` returned by the loader holds the per-key bucket
# counts and feeds `all_bin_size`, while the histogram resolution is the literal 50.
all_histogram, all_boundary = learn_histogram(
    data=data,
    key_attrs=key_attrs,
    all_bin_means=all_bin_means,
    all_bin_width=all_bin_width,
    all_bin_size=bin_size,
    bin_size=50,
)

In [21]:
import numpy as np
import copy

from Join_scheme.join_graph import process_condition, get_join_hyper_graph
from Join_scheme.data_prepare import identify_key_values
from BayesCard.Evaluation.cardinality_estimation import timestamp_transorform, construct_table_query

# Maps a comparison token found in a filter predicate to the numpy ufunc that
# evaluates it; '=' and '==' are synonyms.
OPS = dict(
    [
        ('>', np.greater),
        ('<', np.less),
        ('>=', np.greater_equal),
        ('<=', np.less_equal),
        ('=', np.equal),
        ('==', np.equal),
    ]
)

class Bound_ensemble:
    """
    Histogram-based estimator of a join-cardinality bound.

    Filter predicates are turned into a selectivity with per-attribute
    histograms; joins are handled by combining the per-bucket statistics of the
    join keys. The final estimate is the join bound multiplied by the
    selectivity.
    """
    def __init__(self, hist, boundary, all_bin_means, all_key_size, schema):
        """
        Args:
            hist: mapping table -> attribute -> per-bin frequency.
            boundary: mapping table -> attribute -> bin edges (one more entry
                than there are bins).
            all_bin_means: mapping join key -> per-bucket array.
            all_key_size: mapping join key -> per-bucket array that is combined
                element-wise with ``all_bin_means``.
            schema: schema object handed to ``identify_key_values`` to obtain
                the keys and their equivalence groups.
        """
        self.hist = hist
        self.boundary = boundary
        self.schema = schema
        self.all_bin_means = all_bin_means
        self.all_key_size = all_key_size
        self.all_keys, self.equivalent_keys = identify_key_values(schema)

    def parse_query_simple(self, query):
        """
        Parse a flat ``SELECT ... FROM ... WHERE ...`` join query (no
        aggregation, no nested sub-queries).

        Only the lower-case and upper-case spellings of ``where``, ``from`` and
        ``and`` are recognised, and everything after the first ``;`` is ignored.

        Returns:
            ``(tables_all, final_probs, join_cond, join_keys)`` where
            ``tables_all`` maps alias -> table name, ``final_probs`` is the
            product of the selectivities of all filter predicates (replaced by
            0.001 when it would be exactly 0), ``join_cond`` lists the join
            conditions and ``join_keys`` maps a table to the set of its join
            keys used in the query.
        """
        query = query.replace(" where ", " WHERE ")
        query = query.replace(" from ", " FROM ")
        query = query.replace(" and ", " AND ")
        query = query.split(";")[0]
        query = query.strip()
        tables_all = {}
        join_cond = []
        table_probs = {}
        join_keys = {}
        tables_str = query.split(" WHERE ")[0].split(" FROM ")[-1]
        for table_str in tables_str.split(","):
            table_str = table_str.strip()
            if " as " in table_str:
                tables_all[table_str.split(" as ")[-1]] = table_str.split(" as ")[0]
            else:
                # "name alias" (or a bare name): last token is the alias, first the table.
                tables_all[table_str.split(" ")[-1]] = table_str.split(" ")[0]

        # processing conditions
        conditions = query.split(" WHERE ")[-1].split(" AND ")
        for cond in conditions:
            table, cond, join, join_key = process_condition(cond, tables_all)
            if table not in table_probs:
                table_probs[table] = 1
            if not join:
                attr = cond[0]
                op = cond[1]
                value = cond[2]
                if "Date" in attr:
                    assert "::timestamp" in value
                    value = timestamp_transorform(value.strip().split("::timestamp")[0])
                # Selectivity = total frequency of the bins whose LEFT edge
                # satisfies the predicate; bins are never split, so this is a
                # coarse approximation (an equality predicate only matches when
                # the value coincides with a bin edge).
                edges = self.boundary[table][attr]
                freqs = self.hist[table][attr]
                curr_prob = 0
                for i in range(0, len(edges) - 1):
                    if OPS[op](edges[i], value):
                        curr_prob += freqs[i]
                # Predicates on the same table are combined by multiplication.
                table_probs[table] *= curr_prob
            else:
                join_cond.append(cond)
                for tab in join_key:
                    join_keys.setdefault(tab, set()).add(join_key[tab])
        final_probs = 1
        for table in table_probs:
            final_probs *= table_probs[table]
        if final_probs == 0:
            # Avoid returning an estimate of exactly zero.
            final_probs = 0.001
        return tables_all, final_probs, join_cond, join_keys

    def multiply_hist_oned(self, all_probs, all_means):
        """
        Combine the buckets of several join keys into a single number.

        For every bucket the entries of ``all_means`` are multiplied across
        keys and scaled by the smallest entry of ``all_probs`` across keys; the
        per-bucket values are then summed.

        Args:
            all_probs: list of equally shaped per-bucket arrays, one per key.
            all_means: list of equally shaped per-bucket arrays, one per key.
        """
        all_probs = np.stack(all_probs, axis=0)
        all_means = np.stack(all_means, axis=0)
        multiplier = np.prod(all_means, axis=0)
        min_number = np.amin(all_probs, axis=0)
        multiplier = multiplier * min_number
        return np.sum(multiplier)

    def eliminate_one_key_group(self, key_group, relevant_keys, res):
        """
        Compute the bound contributed by one group of equivalent join keys.

        Args:
            key_group: identifier of the group (not used by the computation).
            relevant_keys: keys of the group that appear in the query.
            res: bound accumulated from the previously eliminated groups, or
                ``None`` for the first group. When given, each key's bucket
                means are rescaled by ``res / sum(means * sizes)``.

        Returns:
            The updated bound.
        """
        all_means = []
        all_probs = []
        for key in relevant_keys:
            bin_means = self.all_bin_means[key]
            # `is not None` rather than truthiness: a running bound of exactly 0
            # must still be propagated instead of restarting the computation.
            if res is not None:
                total = np.sum(bin_means * self.all_key_size[key])
                if total == 0:
                    # An empty key cannot be rescaled (division by zero); it
                    # contributes nothing to the join.
                    bin_means = np.zeros(np.shape(bin_means))
                else:
                    ratio = res / total
                    bin_means = bin_means * ratio
            all_means.append(bin_means)
            all_probs.append(self.all_key_size[key])
        return self.multiply_hist_oned(all_probs, all_means)

    def get_cardinality(self, query_str):
        """
        Estimate the cardinality of a join query.

        Key groups returned by ``get_join_hyper_graph`` are eliminated one after
        the other; the resulting bound is clamped to at least 1 and multiplied
        by the filter selectivity.

        Raises:
            ValueError: if the query contains no join key group, since the bound
                is only defined through join keys.
        """
        tables_all, table_probs, join_cond, join_keys = self.parse_query_simple(query_str)
        equivalent_group = get_join_hyper_graph(join_keys, self.equivalent_keys)
        res = None
        for key_group in equivalent_group:
            res = self.eliminate_one_key_group(key_group, equivalent_group[key_group], res)
        if res is None:
            raise ValueError(
                "query contains no join condition; Bound_ensemble can only estimate join queries"
            )
        if res <= 1:
            res = 1
        # `table_probs` is the scalar selectivity returned by parse_query_simple.
        return res * table_probs

In [22]:
# The bucket widths are what the estimator uses as per-bucket key sizes.
BE = Bound_ensemble(
    hist=all_histogram,
    boundary=all_boundary,
    all_bin_means=all_bin_means,
    all_key_size=all_bin_width,
    schema=schema,
)

In [23]:
# Workload file: one entry per line (newline kept), later split on "||".
# NOTE(review): absolute, machine-specific path.
query_file = "/home/ubuntu/End-to-End-CardEst-Benchmark/workloads/stats_CEB/sub_plan_queries/stats_CEB_sub_queries.sql"
with open(query_file, "r") as workload:
    queries = list(workload)

In [24]:
# Run the estimator over the whole workload and record, per query, the
# prediction, the inference latency and the ratio predicted / true cardinality.
# NOTE: `qerror` stores the signed ratio (values below 1 are under-estimates),
# not the symmetric q-error max(ratio, 1 / ratio).
qerror = []
latency = []
pred = []
for i, query_str in enumerate(queries):
    if not query_str.strip():
        # Blank lines carry no query and would make int() below fail.
        continue
    # Each line is "<sql>||...||<true cardinality>"; the last character of the
    # SQL part is dropped.
    query = query_str.split("||")[0][:-1]
    print("========================")
    true_card = int(query_str.split("||")[-1])
    # perf_counter is monotonic and high-resolution, which matters for
    # sub-millisecond calls; the latency is measured exactly once so the stored
    # and the printed value agree and exclude the bookkeeping below.
    t = time.perf_counter()
    res = BE.get_cardinality(query)
    elapsed = time.perf_counter() - t
    ratio = res / true_card
    pred.append(res)
    latency.append(elapsed)
    qerror.append(ratio)
    print(f"estimating query {i}: predicted {res}, true_card {true_card}, qerror {ratio}, latency {elapsed}")

estimating query 0: predicted 47733.7567245126, true_card 79851, qerror 0.5977853342414321, latency 0.0014939308166503906
estimating query 1: predicted 14362954.23812714, true_card 10220614, qerror 1.4052926994530015, latency 0.0013582706451416016
estimating query 2: predicted 34153855.38067131, true_card 1458075, qerror 23.423935929682155, latency 0.0009281635284423828
estimating query 3: predicted 36276489.43055708, true_card 1709781, qerror 21.21703857427184, latency 0.0011403560638427734
estimating query 4: predicted 10393431.07729698, true_card 7491903, qerror 1.3872885270000133, latency 0.0006632804870605469
estimating query 5: predicted 59216.72628281309, true_card 428612, qerror 0.1381592822478444, latency 0.0023183822631835938
estimating query 6: predicted 54248508.49000834, true_card 55900138, qerror 0.9704539278598622, latency 0.0014350414276123047
estimating query 7: predicted 9273.74789895026, true_card 10972, qerror 0.8452194585262723, latency 0.0014050006866455078
estima

estimating query 199: predicted 69284.4791807575, true_card 69701, qerror 0.994024177282356, latency 0.00014162063598632812
estimating query 200: predicted 800313.6381591125, true_card 1057588, qerror 0.7567347947963787, latency 0.000606536865234375
estimating query 201: predicted 107737.21675237826, true_card 206322, qerror 0.522179974759736, latency 0.0006220340728759766
estimating query 202: predicted 293133.48663667217, true_card 282841, qerror 1.0363896558019245, latency 0.00015497207641601562
estimating query 203: predicted 2800763.2452816623, true_card 3460999, qerror 0.8092354968266857, latency 0.0006122589111328125
estimating query 204: predicted 305884.76058022224, true_card 371988, qerror 0.8222973874969682, latency 0.0006239414215087891
estimating query 205: predicted 1384519.9213427505, true_card 1633681, qerror 0.8474848647580222, latency 0.0006115436553955078
estimating query 206: predicted 32835597.553459797, true_card 1423047, qerror 23.07414832641494, latency 0.001283

estimating query 352: predicted 11146391.103208316, true_card 3672, qerror 3035.5095596972537, latency 0.0007736682891845703
estimating query 353: predicted 3130318.252758896, true_card 3829734, qerror 0.8173722385833836, latency 0.0003154277801513672
estimating query 354: predicted 16228.052243840133, true_card 120, qerror 135.23376869866777, latency 0.00041937828063964844
estimating query 355: predicted 14007.014804430999, true_card 16, qerror 875.4384252769374, latency 0.00042724609375
estimating query 356: predicted 989787.756586787, true_card 4930, qerror 200.76830762409475, latency 0.0002760887145996094
estimating query 357: predicted 18298422849.07551, true_card 21963242042, qerror 0.8331385145273039, latency 0.0005533695220947266
estimating query 358: predicted 79474248.33621088, true_card 813, qerror 97754.30299656934, latency 0.0006792545318603516
estimating query 359: predicted 58280386.1236844, true_card 58, qerror 1004834.2435118, latency 0.0006949901580810547
estimating q

estimating query 500: predicted 13791571.470853955, true_card 4801, qerror 2872.6455885969494, latency 0.0013587474822998047
estimating query 501: predicted 1032.0000000000005, true_card 596, qerror 1.7315436241610747, latency 0.00029397010803222656
estimating query 502: predicted 71943.4390281755, true_card 50205, qerror 1.4329935071840554, latency 0.000423431396484375
estimating query 503: predicted 53779977.74814731, true_card 54807156, qerror 0.9812583186791759, latency 0.0006427764892578125
estimating query 504: predicted 3638079.764402453, true_card 3632995, qerror 1.0013996067713975, latency 0.0019457340240478516
estimating query 505: predicted 225309.22900535987, true_card 148177, qerror 1.520541170393245, latency 0.0002911090850830078
estimating query 506: predicted 47673.38669120944, true_card 60582, qerror 0.7869232889506691, latency 0.0010890960693359375
estimating query 507: predicted 16228052.243840128, true_card 15948894, qerror 1.0175032979616097, latency 0.000701665878

estimating query 683: predicted 11366.479960553088, true_card 183601, qerror 0.06190859505423766, latency 0.001153707504272461
estimating query 684: predicted 211.7359659052281, true_card 9723, qerror 0.02177681434796134, latency 0.000982046127319336
estimating query 685: predicted 315.9872864816758, true_card 156174, qerror 0.0020233027679490557, latency 0.0007250308990478516
estimating query 686: predicted 44908.20242069581, true_card 64342, qerror 0.697960934081872, latency 0.000629425048828125
estimating query 687: predicted 118.88251847942932, true_card 12375, qerror 0.009606668159953884, latency 0.0011670589447021484
estimating query 688: predicted 661.8037574180634, true_card 119851, qerror 0.0055218876556563014, latency 0.0012133121490478516
estimating query 689: predicted 207071.7693689906, true_card 428714, qerror 0.48300678160496413, latency 0.0012159347534179688
estimating query 690: predicted 76.37621392273482, true_card 22952, qerror 0.0033276496132247655, latency 0.00372

estimating query 819: predicted 109839964.95400716, true_card 1994249, qerror 55.07836030205213, latency 0.0016722679138183594
estimating query 820: predicted 88119282.2374153, true_card 18218641, qerror 4.836764840880025, latency 0.002373933792114258
estimating query 821: predicted 95376.80626541853, true_card 692609, qerror 0.13770656498171197, latency 0.002480745315551758
estimating query 822: predicted 277.00281439637763, true_card 17103, qerror 0.016196153563490478, latency 0.001027822494506836
estimating query 823: predicted 29667.00046432788, true_card 24679, qerror 1.2021151774515937, latency 0.0006682872772216797
estimating query 824: predicted 276.5018619857278, true_card 161741, qerror 0.0017095347622787533, latency 0.0007562637329101562
estimating query 825: predicted 10383234.71623896, true_card 7472581, qerror 1.3895111630424561, latency 0.0009541511535644531
estimating query 826: predicted 138878.35965920478, true_card 53154, qerror 2.612754631057019, latency 0.001579046

estimating query 962: predicted 319663053423123.9, true_card 26312006, qerror 12148942.707869703, latency 0.000339508056640625
estimating query 963: predicted 1947212459872908.8, true_card 141313952, qerror 13779336.239021245, latency 0.0007710456848144531
estimating query 964: predicted 155558140404533.66, true_card 412908832, qerror 376737.25614213466, latency 0.0007236003875732422
estimating query 965: predicted 2105768.7969738944, true_card 2461248, qerror 0.8555695309752996, latency 0.0008838176727294922
estimating query 966: predicted 11261412.851995625, true_card 10257634, qerror 1.0978567622899809, latency 0.0005843639373779297
estimating query 967: predicted 291072365.87488705, true_card 26312006, qerror 11.062340358043665, latency 0.0002887248992919922
estimating query 968: predicted 1947212459872908.8, true_card 141313952, qerror 13779336.239021245, latency 0.000667572021484375
estimating query 969: predicted 940716916688.8276, true_card 412908832, qerror 2278.267849424029, 

estimating query 1193: predicted 399523.8733004783, true_card 922156, qerror 0.43324976826098655, latency 0.000949859619140625
estimating query 1194: predicted 156.63178989814278, true_card 84576, qerror 0.0018519649770400916, latency 0.001013040542602539
estimating query 1195: predicted 3112795.344662517, true_card 4255848, qerror 0.7314160056145137, latency 0.0001354217529296875
estimating query 1196: predicted 5295.041169776719, true_card 700696, qerror 0.0075568308792639295, latency 0.0007886886596679688
estimating query 1197: predicted 287834.8327710577, true_card 2147406, qerror 0.13403838527556397, latency 0.0008780956268310547
estimating query 1198: predicted 112.84453336798737, true_card 203276, qerror 0.000555129643282962, latency 0.001039266586303711
estimating query 1199: predicted 512759.90781962767, true_card 2012684, qerror 0.25476423910540735, latency 0.0008821487426757812
estimating query 1200: predicted 201.02553944102152, true_card 116602, qerror 0.001724031658470879

estimating query 1359: predicted 29144889.43613269, true_card 48222, qerror 604.389893329449, latency 0.001745462417602539
estimating query 1360: predicted 23122206.600601085, true_card 109211, qerror 211.7204915310828, latency 0.0008344650268554688
estimating query 1361: predicted 1920366047.3312495, true_card 426751, qerror 4499.968476538425, latency 0.0014071464538574219
estimating query 1362: predicted 585.486295685674, true_card 20059, qerror 0.029188209566063814, latency 0.0007040500640869141
estimating query 1363: predicted 253635.19527780623, true_card 529790, qerror 0.4787466642968086, latency 0.0002315044403076172
estimating query 1364: predicted 606306.5812164246, true_card 590979, qerror 1.0259359151787535, latency 0.0004572868347167969
estimating query 1365: predicted 779.0120664885515, true_card 67005, qerror 0.011626178143251271, latency 0.00048422813415527344
estimating query 1366: predicted 1750.86684390808, true_card 71750, qerror 0.024402325350635263, latency 0.00067

estimating query 1550: predicted 250689607.8006051, true_card 3778084, qerror 66.35363528195909, latency 0.002405405044555664
estimating query 1551: predicted 258874505.6531216, true_card 260664622, qerror 0.9931324921151808, latency 0.0005719661712646484
estimating query 1552: predicted 11330458.464746077, true_card 11031325, qerror 1.0271167302881636, latency 0.00012731552124023438
estimating query 1553: predicted 155277.9409182063, true_card 57512, qerror 2.699922466932228, latency 0.0005128383636474609
estimating query 1554: predicted 13786875.33425875, true_card 13983299, qerror 0.9859529810711156, latency 0.0005538463592529297
estimating query 1555: predicted 220915.75934965615, true_card 139728, qerror 1.5810414473094594, latency 0.0009601116180419922
estimating query 1556: predicted 28995.58213374342, true_card 17404, qerror 1.6660297709574479, latency 0.0005152225494384766
estimating query 1557: predicted 57364429833.08039, true_card 57449461905, qerror 0.9985198804462221, lat

estimating query 1751: predicted 15657.856892093014, true_card 14309, qerror 1.094266328331331, latency 0.0007851123809814453
estimating query 1752: predicted 15904875.75359579, true_card 1242771, qerror 12.7979134962079, latency 0.0009424686431884766
estimating query 1753: predicted 79474248.33621088, true_card 986987, qerror 80.52208219177241, latency 0.0010623931884765625
estimating query 1754: predicted 1111.2278940973895, true_card 217448, qerror 0.005110315542554493, latency 0.0006897449493408203
estimating query 1755: predicted 889.476313761421, true_card 977123, qerror 0.0009103012760537016, latency 0.0005309581756591797
estimating query 1756: predicted 196.3666666666667, true_card 109665, qerror 0.0017906047204364812, latency 0.0005280971527099609
estimating query 1757: predicted 929.6599618309599, true_card 31729, qerror 0.02930000825210249, latency 0.0008065700531005859
estimating query 1758: predicted 889.476313761421, true_card 645020, qerror 0.001378990285202662, latency 

estimating query 1943: predicted 39846.749405114904, true_card 28359, qerror 1.4050830214434538, latency 0.0009443759918212891
estimating query 1944: predicted 270830.9797424018, true_card 659714, qerror 0.4105278647147124, latency 0.0015528202056884766
estimating query 1945: predicted 122236.58737905028, true_card 50737, qerror 2.4092198470356996, latency 0.001348733901977539
estimating query 1946: predicted 189864.9861345336, true_card 203839, qerror 0.9314458280041287, latency 0.0008897781372070312
estimating query 1947: predicted 992574.3564496896, true_card 5817, qerror 170.63337741957875, latency 0.001264810562133789
estimating query 1948: predicted 280738.31887999235, true_card 199310, qerror 1.408551095680058, latency 0.0012464523315429688
estimating query 1949: predicted 44291.8364929924, true_card 61674, qerror 0.7181605943021759, latency 0.00067901611328125
estimating query 1950: predicted 189125.97090190602, true_card 185637, qerror 1.018794587834893, latency 0.001092910766

estimating query 2099: predicted 258408149.3166984, true_card 1765715, qerror 146.34759817790436, latency 0.0015418529510498047
estimating query 2100: predicted 15384008.22652799, true_card 410288, qerror 37.49563288842957, latency 0.0009524822235107422
estimating query 2101: predicted 672303.9703368933, true_card 150888972, qerror 0.004455620324173812, latency 0.0009179115295410156
estimating query 2102: predicted 2922.0998607087367, true_card 275324, qerror 0.010613313262587848, latency 0.001239776611328125
estimating query 2103: predicted 197.6726807621989, true_card 69518, qerror 0.0028434747944733577, latency 0.0013344287872314453
estimating query 2104: predicted 16145230.234148117, true_card 869316, qerror 18.572337601226845, latency 0.000530242919921875
estimating query 2105: predicted 5195861655.4460745, true_card 422358560837, qerror 0.012302015721308662, latency 0.0014204978942871094
estimating query 2106: predicted 14984628.16911041, true_card 11874391, qerror 1.261928141755

estimating query 2254: predicted 12486245.205039399, true_card 16500753, qerror 0.7567076002555397, latency 0.0026535987854003906
estimating query 2255: predicted 99365974.66992901, true_card 79337134, qerror 1.2524522838186847, latency 0.0016715526580810547
estimating query 2256: predicted 35790891675885.195, true_card 279208161, qerror 128187.1258622888, latency 0.0015370845794677734
estimating query 2257: predicted 4198968429.8968177, true_card 113925678, qerror 36.85708528236117, latency 0.001973867416381836
estimating query 2258: predicted 69691.0848718897, true_card 171647, qerror 0.40601399891573814, latency 0.0007207393646240234
estimating query 2259: predicted 40041.734239736186, true_card 30393, qerror 1.3174656743242255, latency 0.00023937225341796875
estimating query 2260: predicted 276501.8619857278, true_card 704085, qerror 0.3927109113043564, latency 0.00010633468627929688
estimating query 2261: predicted 661803.7574180634, true_card 865125, qerror 0.7649805027228012, la

estimating query 2410: predicted 273059.5680727073, true_card 699208, qerror 0.39052695059654247, latency 0.0013055801391601562
estimating query 2411: predicted 661.8037574180634, true_card 668933, qerror 0.000989342366751324, latency 0.0008924007415771484
estimating query 2412: predicted 14936979.42129138, true_card 14929017, qerror 1.000533352014495, latency 0.0007719993591308594
estimating query 2413: predicted 154131.44740331924, true_card 170249, qerror 0.9053295314704888, latency 0.00054168701171875
estimating query 2414: predicted 90.56323166080192, true_card 6021, qerror 0.015041227646703525, latency 0.0005242824554443359
estimating query 2415: predicted 796.9934308895038, true_card 71359, qerror 0.011168786430436298, latency 0.0003750324249267578
estimating query 2416: predicted 196.3666666666667, true_card 55484, qerror 0.0035391584360656534, latency 0.000762939453125
estimating query 2417: predicted 45174.89361359186, true_card 65334, qerror 0.6914453977039805, latency 0.000

estimating query 2559: predicted 122.93489030626888, true_card 87188, qerror 0.001409997824313769, latency 0.002289295196533203
estimating query 2560: predicted 261.4710885698841, true_card 436286, qerror 0.0005993112054246162, latency 0.0020537376403808594
estimating query 2561: predicted 24020.03670650935, true_card 169983, qerror 0.14130846441414346, latency 0.0011882781982421875
estimating query 2562: predicted 30855.937138768593, true_card 33456, qerror 0.9222841086432506, latency 0.000560760498046875
estimating query 2563: predicted 260758.7518083654, true_card 679915, qerror 0.38351669224589163, latency 0.0005955696105957031
estimating query 2564: predicted 653564.6698784218, true_card 853748, qerror 0.7655241006461179, latency 0.00033402442932128906
estimating query 2565: predicted 3119.958259541344, true_card 10494, qerror 0.29730877258827365, latency 0.0009844303131103516
estimating query 2566: predicted 28348.80330054993, true_card 289244, qerror 0.0980099960605922, latency 

In [25]:
# Distribution of the per-query values collected in `qerror`, followed by the
# summed inference latency.
for pct in (50, 90, 95, 99, 100):
    pct_value = np.percentile(qerror, pct)
    print(f"q-error {pct}% percentile is {pct_value}")
total_latency = np.sum(latency)
print(f"total inference time: {total_latency}")

q-error 50% percentile is 0.7649805027228012
q-error 90% percentile is 179.42461836380116
q-error 95% percentile is 2731.7018756828616
q-error 99% percentile is 4482471.18758561
q-error 100% percentile is 114172012572.37447
total inference time: 2.763202667236328


In [26]:
# Persist the predictions, one per line, in workload order.
with open("stats_CEB_join_hist.txt", "w") as out_file:
    out_file.writelines(str(estimate) + "\n" for estimate in pred)