In [5]:
import numpy as np
import pandas as pd
import pickle
from scipy import stats
import time
import sys
sys.path.append("../")
from Schemas.stats.schema import gen_stats_light_schema
from Evaluation.training import train_one_stats, test_trained_BN_on_stats
from Join_scheme.data_prepare import process_stats_data, update_stats_data
from BayesCard.Models.Bayescard_BN import Bayescard_BN
from BayesCard.Evaluation.cardinality_estimation import parse_query_single_table

In [6]:
# Path template for the per-table CSV files in the "stats_simplified" directory.
# It already ends with ".csv", so get_data / get_data_by_date pass it to
# gen_stats_light_schema unchanged; the "{}" placeholder is presumably filled in
# with each table name there — TODO confirm.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
data_folder = "/Users/ziniuw/Desktop/research/Learned_QO/data/stats_simplified/{}.csv"

In [7]:
def read_table_csv(table_obj, csv_seperator=',', stats=True):
    """
    Read one table's CSV file into a DataFrame with table-qualified column names.

    Parameters
    ----------
    table_obj : object
        Must expose ``csv_file_location`` (path of the CSV), ``table_name``,
        ``attributes`` (one name per CSV column, in file order) and
        ``irrelevant_attributes`` (names of columns to discard).
    csv_seperator : str
        Field separator; only used when ``stats`` is False.
    stats : bool
        True  -> the file has a header row and is read with pandas defaults.
        False -> the file has no header and uses backslash escapes / double quotes.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``"<table_name>.<attribute>"``; irrelevant attributes
        are removed and every column that can be parsed as numeric is converted,
        all other columns are returned unchanged.
    """
    if stats:
        df_rows = pd.read_csv(table_obj.csv_file_location)
    else:
        df_rows = pd.read_csv(table_obj.csv_file_location, header=None, escapechar='\\', encoding='utf-8',
                              quotechar='"',
                              sep=csv_seperator)

    prefix = table_obj.table_name + '.'
    df_rows.columns = [prefix + attr for attr in table_obj.attributes]

    # Drop all irrelevant columns in one call instead of copying the frame once per column.
    irrelevant = [prefix + attribute for attribute in table_obj.irrelevant_attributes]
    if irrelevant:
        df_rows = df_rows.drop(columns=irrelevant)

    def _to_numeric_or_keep(column):
        # Same semantics as the deprecated pd.to_numeric(..., errors="ignore"):
        # convert when possible, otherwise hand the column back untouched.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    return df_rows.apply(_to_numeric_or_keep)


def timestamp_transorform(time_string, start_date="2010-07-19 00:00:00"):
    """
    Return the whole number of seconds between ``start_date`` and ``time_string``.

    Both arguments must be formatted as "%Y-%m-%d %H:%M:%S". Each one is parsed
    with time.strptime and converted with time.mktime, which interprets the
    value as local time, so the result depends on the machine's timezone rules
    when the two dates fall on different sides of a DST change.
    """
    layout = "%Y-%m-%d %H:%M:%S"

    def _epoch_seconds(stamp):
        # Local-time epoch seconds, truncated to an int.
        return int(time.mktime(time.strptime(stamp, layout)))

    origin = _epoch_seconds(start_date)
    return _epoch_seconds(time_string) - origin


def get_data_by_date(data_path, time_date="2014-01-01 00:00:00"):
    """
    Load every table of the schema and split each one at ``time_date``.

    ``time_date`` is converted to an integer offset with timestamp_transorform.
    For each table, the first column whose name contains "Date" is used to find
    the split position via np.searchsorted (which assumes that column is sorted
    in ascending order). Tables without such a column are not split.

    Returns
    -------
    (before_data, after_data) : tuple of dict
        Both map table name -> DataFrame slice. An entry is None when that side
        of the split is empty; a table with no "Date" column therefore has its
        full frame in ``before_data`` and None in ``after_data``.
    """
    cutoff = timestamp_transorform(time_date)
    if not data_path.endswith(".csv"):
        data_path += "/{}.csv"
    schema = gen_stats_light_schema(data_path)

    before_data, after_data = {}, {}
    for table_obj in schema.tables:
        name = table_obj.table_name
        rows = read_table_csv(table_obj)

        # Only the first "Date"-like column decides where the table is cut.
        date_columns = [column for column in rows.columns if "Date" in column]
        if date_columns:
            split_at = np.searchsorted(rows[date_columns[0]].values, cutoff)
        else:
            split_at = len(rows)

        head = rows[:split_at] if split_at > 0 else None
        tail = rows[split_at:] if split_at < len(rows) else None
        before_data[name] = head
        after_data[name] = tail
    return before_data, after_data

def get_data(data_path):
    """
    Load every table of the schema in full.

    ``data_path`` is used as-is when it ends with ".csv"; otherwise "/{}.csv"
    is appended before it is handed to gen_stats_light_schema.

    Returns a dict mapping table name -> DataFrame as produced by read_table_csv.
    """
    if not data_path.endswith(".csv"):
        data_path += "/{}.csv"
    schema = gen_stats_light_schema(data_path)
    return {table_obj.table_name: read_table_csv(table_obj) for table_obj in schema.tables}


In [8]:
# Split every table by position at the default cutoff ("2014-01-01 00:00:00").
# Either side can be None for a table when that side of the split is empty.
before_data, after_data = get_data_by_date(data_folder)
# Also load every table in full, without any split.
all_data = get_data(data_folder)

In [9]:
# Directory passed to process_stats_data as its second argument.
# NOTE(review): this is a /home/ubuntu path while data_folder points under
# /Users/ziniuw — confirm it exists on the machine running this notebook.
model_path = "/home/ubuntu/data_CE/CE_scheme_models/update/"
# Pre-process only the rows before the cutoff (actual_data=before_data).
# The positional arguments 200, "sub_optimal", False are interpreted by
# Join_scheme.data_prepare.process_stats_data — presumably bucket count,
# bucketing strategy and a save flag; TODO confirm.
data, null_values, key_attrs, table_buckets, equivalent_keys, schema, bin_size = process_stats_data(data_folder,
                                        model_path, 200, "sub_optimal", False, actual_data=before_data)


bucketizing equivalent key group: {'votes.PostId', 'postLinks.PostId', 'posts.Id', 'comments.PostId', 'tags.ExcerptPostId', 'postLinks.RelatedPostId', 'postHistory.PostId'}
bucketizing equivalent key group: {'badges.UserId', 'users.Id', 'comments.UserId', 'postHistory.UserId', 'posts.OwnerUserId', 'votes.UserId'}


In [None]:
# Build a Bayescard_BN model for the postHistory table from the processed data.
# Relies on data, key_attrs, bin_size and null_values returned by process_stats_data.
# NOTE(review): this cell carries no execution count, so it was apparently not
# run in the saved session.
t_name = "postHistory"
bn = Bayescard_BN(t_name, key_attrs[t_name], bin_size[t_name], null_values=null_values[t_name])
bn.build_from_data(data[t_name])

In [11]:
# model_path is reassigned here to a local directory; the pickles loaded after
# training live in this same directory, so this is presumably where
# train_one_stats saves its artifacts — TODO confirm.
model_path = "/Users/ziniuw/Desktop/research/Learned_QO/data/saved_models/"
# Train on the pre-cutoff rows only (actual_data=before_data). The remaining
# positional arguments (2, 200, "sub_optimal", True) are interpreted by
# Evaluation.training.train_one_stats — TODO confirm their meaning.
train_one_stats("stats", data_folder, model_path, 2, 200, "sub_optimal", True, actual_data=before_data)

bucketizing equivalent key group: {'votes.PostId', 'postLinks.PostId', 'posts.Id', 'comments.PostId', 'tags.ExcerptPostId', 'postLinks.RelatedPostId', 'postHistory.PostId'}
bucketizing equivalent key group: {'badges.UserId', 'users.Id', 'comments.UserId', 'postHistory.UserId', 'posts.OwnerUserId', 'votes.UserId'}
badges
Discretizing table takes 0.015569925308227539 secs
Structure learning took 0.21733689308166504 secs.
done, parameter learning took 0.06353521347045898 secs.
votes
Discretizing table takes 1.2292289733886719 secs
Structure learning took 1.297508955001831 secs.
done, parameter learning took 0.0673990249633789 secs.
postHistory
Discretizing table takes 0.06839895248413086 secs
Structure learning took 1.7427341938018799 secs.
done, parameter learning took 0.12562084197998047 secs.
posts
Discretizing table takes 0.2793567180633545 secs
Structure learning took 1.6007959842681885 secs.
done, parameter learning took 0.16286301612854004 secs.
users
Discretizing table takes 1.241

In [12]:
# Load "buckets.pkl" and the pickled model from the saved_models directory.
# NOTE(review): pickle.load can execute arbitrary code contained in the file —
# only load pickles that were produced locally / come from a trusted source.
with open("/Users/ziniuw/Desktop/research/Learned_QO/data/saved_models/buckets.pkl", "rb") as f:
    buckets = pickle.load(f)
with open("/Users/ziniuw/Desktop/research/Learned_QO/data/saved_models/model_stats_sub_optimal_200.pkl", "rb") as f:
    FJmodel = pickle.load(f)

In [13]:
# Read the sub-query workload, one entry per line (trailing newlines are kept).
# The evaluation loop splits each line on "||", using the first field as the
# query text and the last field as the true cardinality.
query_file = "/Users/ziniuw/Desktop/research/Learned_QO/data/stats_simplified/stats_CEB_sub_queries.sql"
with open(query_file, "r") as f:
    queries = f.readlines()


In [14]:
# Run the loaded model on every query and record prediction, latency and the
# ratio predicted / true cardinality.
# Note: `qerror` holds the signed ratio res / true_card (values below 1 are
# under-estimates), not the symmetric q-error max(res/true, true/res).
qerror = []
latency = []
pred = []
for i, query_str in enumerate(queries):
    # Split the line once; first field is the query text, last field the true cardinality.
    fields = query_str.split("||")
    # The final character of the query text is dropped, exactly as before
    # (presumably a trailing separator/space — TODO confirm against the query file).
    query = fields[0][:-1]
    print("========================")
    true_card = int(fields[-1])

    # perf_counter is a monotonic, high-resolution clock, which suits these
    # millisecond-scale measurements better than the wall clock.
    t = time.perf_counter()
    res = FJmodel.get_cardinality_bound_one(query)
    elapsed = time.perf_counter() - t

    ratio = res / true_card
    pred.append(res)
    latency.append(elapsed)
    qerror.append(ratio)
    # Report the same latency value that was stored, covering only the model call.
    print(f"estimating query {i}: predicted {res}, true_card {true_card}, qerror {ratio}, latency {elapsed}")

estimating query 0: predicted 54920.0, true_card 79851, qerror 0.6877809920977821, latency 0.010627269744873047
estimating query 1: predicted 6634616.999999996, true_card 10220614, qerror 0.6491407463387225, latency 0.013267993927001953
estimating query 2: predicted 2224733.0, true_card 1458075, qerror 1.5258014848344563, latency 0.008116960525512695
estimating query 3: predicted 2256146.713923665, true_card 1709781, qerror 1.3195530386193701, latency 0.009044885635375977
estimating query 4: predicted 5774461.999999999, true_card 7491903, qerror 0.7707603795724529, latency 0.00740504264831543
estimating query 5: predicted 577314.3126980041, true_card 428612, qerror 1.3469392193825747, latency 0.008238077163696289
estimating query 6: predicted 33096680.247657806, true_card 55900138, qerror 0.5920679524558206, latency 0.007537126541137695
estimating query 7: predicted 7191.000000000002, true_card 10972, qerror 0.6553955523149838, latency 0.005619049072265625
estimating query 8: predicted

estimating query 58: predicted 24764.87604491237, true_card 31965, qerror 0.7747497589523656, latency 0.010013818740844727
estimating query 59: predicted 24079.479201561866, true_card 26836, qerror 0.8972827247563671, latency 0.018220901489257812
estimating query 60: predicted 2243674.2445299625, true_card 2704241, qerror 0.8296872373911802, latency 0.004176139831542969
estimating query 61: predicted 24802.214955523268, true_card 32918, qerror 0.7534544916314256, latency 0.0038542747497558594
estimating query 62: predicted 62888.13620660873, true_card 86112, qerror 0.7303063011729926, latency 0.004645824432373047
estimating query 63: predicted 2143942.3005878557, true_card 2488080, qerror 0.8616854363958778, latency 0.005914211273193359
estimating query 64: predicted 949396.0, true_card 1056687, qerror 0.8984647298585106, latency 0.0023779869079589844
estimating query 65: predicted 22294.395093399944, true_card 20334, qerror 1.0964097124717196, latency 0.003798961639404297
estimating q

estimating query 117: predicted 887150.3009236722, true_card 595820, qerror 1.488956901285073, latency 0.018607139587402344
estimating query 118: predicted 1361695.979874585, true_card 1187068, qerror 1.1471086575281155, latency 0.005686044692993164
estimating query 119: predicted 949396.0, true_card 1056687, qerror 0.8984647298585106, latency 0.002401113510131836
estimating query 120: predicted 24969.0, true_card 34789, qerror 0.7177268676880623, latency 0.002238035202026367
estimating query 121: predicted 1177408.6521548855, true_card 1486002, qerror 0.7923331544337662, latency 0.004884004592895508
estimating query 122: predicted 24748.17956538627, true_card 37048, qerror 0.668003119342104, latency 0.004334926605224609
estimating query 123: predicted 54920.0, true_card 79851, qerror 0.6877809920977821, latency 0.0010340213775634766
estimating query 124: predicted 121367108.5810282, true_card 232039659, qerror 0.5230446773800344, latency 0.006112813949584961
estimating query 125: pred

estimating query 177: predicted 706.9719693599908, true_card 913, qerror 0.7743395064183908, latency 0.009896993637084961
estimating query 178: predicted 53359.525123096995, true_card 68895, qerror 0.7745050456941287, latency 0.003949642181396484
estimating query 179: predicted 30071842.999999985, true_card 91539518, qerror 0.32851214051618655, latency 0.009258031845092773
estimating query 180: predicted 168982.08176592021, true_card 70700, qerror 2.3901284549635107, latency 0.011445999145507812
estimating query 181: predicted 4716207.434891367, true_card 1984289, qerror 2.376774469289185, latency 0.005767822265625
estimating query 182: predicted 29035.63870167997, true_card 18346, qerror 1.5826686308557707, latency 0.009877920150756836
estimating query 183: predicted 19219805.679587316, true_card 3169724, qerror 6.063558114077855, latency 0.011773347854614258
estimating query 184: predicted 211320.2737914746, true_card 345663, qerror 0.611347681966177, latency 0.008586883544921875
est

estimating query 242: predicted 22287.316910170237, true_card 5044, qerror 4.418579879097985, latency 0.009063959121704102
estimating query 243: predicted 171221.11993579773, true_card 1717, qerror 99.7210948956306, latency 0.014568090438842773
estimating query 244: predicted 214695.99999999997, true_card 213642, qerror 1.0049334868611977, latency 0.004837989807128906
estimating query 245: predicted 1039890.0, true_card 1331347, qerror 0.7810811155919531, latency 0.0025718212127685547
estimating query 246: predicted 137437.71052934736, true_card 150600, qerror 0.9126009995308589, latency 0.0029981136322021484
estimating query 247: predicted 27412.077844539683, true_card 41569, qerror 0.6594355852808507, latency 0.0035991668701171875
estimating query 248: predicted 1039890.0, true_card 915414, qerror 1.1359778198716646, latency 0.004544973373413086
estimating query 249: predicted 213711.70844690394, true_card 209583, qerror 1.0196996342589997, latency 0.007542133331298828
estimating que

estimating query 303: predicted 665365.4453221681, true_card 334037, qerror 1.991891453108991, latency 0.011473894119262695
estimating query 304: predicted 782012.7129523747, true_card 403975, qerror 1.9357948213438325, latency 0.009913206100463867
estimating query 305: predicted 3528943.6643128246, true_card 69251, qerror 50.95873943066273, latency 0.009159088134765625
estimating query 306: predicted 778735.3170127147, true_card 65948, qerror 11.80832348233024, latency 0.011260986328125
estimating query 307: predicted 10484145.486354617, true_card 988335, qerror 10.60788648216912, latency 0.012093067169189453
estimating query 308: predicted 18934991.617573183, true_card 1730990, qerror 10.938822071515828, latency 0.010751962661743164
estimating query 309: predicted 570696.8602667785, true_card 114676, qerror 4.97660243003574, latency 0.01244211196899414
estimating query 310: predicted 15802325.781034345, true_card 454094, qerror 34.7996797602134, latency 0.01389002799987793
estimating

estimating query 376: predicted 246326164.3531798, true_card 20762610, qerror 11.863930611478027, latency 0.006717205047607422
estimating query 377: predicted 1183957.0774934362, true_card 3254, qerror 363.8466740914063, latency 0.007288217544555664
estimating query 378: predicted 1023654.1062278796, true_card 201495, qerror 5.0802953235955215, latency 0.007756948471069336
estimating query 379: predicted 240907953.60019144, true_card 16621776, qerror 14.493514628051265, latency 0.010118961334228516
estimating query 380: predicted 1552063.2560711328, true_card 18142, qerror 85.5508354134678, latency 0.00869607925415039
estimating query 381: predicted 1744118.3262061386, true_card 876769, qerror 1.9892563790532496, latency 0.00867319107055664
estimating query 382: predicted 743857.7308418495, true_card 9988, qerror 74.47514325609225, latency 0.007277965545654297
estimating query 383: predicted 165111008816.31937, true_card 27903957956, qerror 5.917117889751431, latency 0.0093190670013427

estimating query 440: predicted 8184216710.944752, true_card 86914174, qerror 94.16435012020884, latency 0.008880138397216797
estimating query 441: predicted 425388.25754127203, true_card 311724, qerror 1.3646310760200435, latency 0.005372285842895508
estimating query 442: predicted 144535562.99999997, true_card 263105194, qerror 0.5493451527984657, latency 0.002470254898071289
estimating query 443: predicted 9869471.272635838, true_card 15716615, qerror 0.6279641813861215, latency 0.003673076629638672
estimating query 444: predicted 62815.86255077953, true_card 66480, qerror 0.9448836123763467, latency 0.006161928176879883
estimating query 445: predicted 355636.7327472204, true_card 494123, qerror 0.7197332096405559, latency 0.004658937454223633
estimating query 446: predicted 110724.46153539656, true_card 73766, qerror 1.5010229853238153, latency 0.005425214767456055
estimating query 447: predicted 4212.479233417791, true_card 4137, qerror 1.018244919849599, latency 0.007674932479858

estimating query 502: predicted 47359.306263916726, true_card 50205, qerror 0.9433185193490036, latency 0.004896879196166992
estimating query 503: predicted 32100732.99999999, true_card 54807156, qerror 0.585703315822481, latency 0.0032160282135009766
estimating query 504: predicted 2711703.999999999, true_card 3632995, qerror 0.7464100556152703, latency 0.004391908645629883
estimating query 505: predicted 114452.97473544686, true_card 148177, qerror 0.7724071531711862, latency 0.002914905548095703
estimating query 506: predicted 49318.2452174932, true_card 60582, qerror 0.8140742335593608, latency 0.004148244857788086
estimating query 507: predicted 9680005.999999994, true_card 15948894, qerror 0.6069390140783426, latency 0.003353118896484375
estimating query 508: predicted 537.0000000000002, true_card 26, qerror 20.653846153846164, latency 0.0068209171295166016
estimating query 509: predicted 14707245.607190315, true_card 1582530, qerror 9.293501928677696, latency 0.007965087890625
e

estimating query 559: predicted 13292151.612831261, true_card 21426821, qerror 0.6203510830109265, latency 0.004940986633300781
estimating query 560: predicted 2244865.9524915917, true_card 2712537, qerror 0.8275890623765101, latency 0.008096933364868164
estimating query 561: predicted 2702698.970703454, true_card 3676560, qerror 0.735116242004334, latency 0.0044820308685302734
estimating query 562: predicted 62219.150634976744, true_card 79662, qerror 0.7810392738693072, latency 0.007627964019775391
estimating query 563: predicted 4706218.408894064, true_card 6053427, qerror 0.7774469583748287, latency 0.008363008499145508
estimating query 564: predicted 3986684.0526696276, true_card 6167152, qerror 0.6464384293867944, latency 0.0047419071197509766
estimating query 565: predicted 45052.04066272676, true_card 58416, qerror 0.771227757168015, latency 0.007523059844970703
estimating query 566: predicted 944761.3353596756, true_card 1052150, qerror 0.8979340734302862, latency 0.0080587863

estimating query 616: predicted 37365468.35444408, true_card 35509433, qerror 1.0522687972642109, latency 0.004266977310180664
estimating query 617: predicted 489536.1229294569, true_card 113661, qerror 4.306984127620352, latency 0.010050058364868164
estimating query 618: predicted 21481251.183531374, true_card 15012180, qerror 1.4309215039741978, latency 0.008717775344848633
estimating query 619: predicted 440598.56889730965, true_card 46721, qerror 9.430418203747987, latency 0.014620780944824219
estimating query 620: predicted 238818.39550441402, true_card 79008, qerror 3.022711567238938, latency 0.00972890853881836
estimating query 621: predicted 272347270.65197563, true_card 510775350, qerror 0.533203629838393, latency 0.007132053375244141
estimating query 622: predicted 1476093.2103782622, true_card 165958, qerror 8.894378158198233, latency 0.013342142105102539
estimating query 623: predicted 1269614.443181675, true_card 328282, qerror 3.867450677105887, latency 0.0084950923919677

estimating query 688: predicted 521033.0045773301, true_card 119851, qerror 4.347339651545086, latency 0.008691787719726562
estimating query 689: predicted 1645325.5534161855, true_card 428714, qerror 3.8378162444337844, latency 0.009225130081176758
estimating query 690: predicted 117993.68303836914, true_card 22952, qerror 5.140888943811831, latency 0.013981819152832031
estimating query 691: predicted 2498486.5823326833, true_card 572851, qerror 4.361494668478685, latency 0.009739875793457031
estimating query 692: predicted 39414.75640327008, true_card 29438, qerror 1.3389074122994118, latency 0.00905609130859375
estimating query 693: predicted 1181809.5325862542, true_card 98645, qerror 11.980430154455412, latency 0.00945591926574707
estimating query 694: predicted 4568.281744122971, true_card 3268, qerror 1.397883030637384, latency 0.012495040893554688
estimating query 695: predicted 982140.0933555645, true_card 938446, qerror 1.0465600507174249, latency 0.013343095779418945
estimat

estimating query 749: predicted 269798.00658894435, true_card 109961, qerror 2.453579056110297, latency 0.011879920959472656
estimating query 750: predicted 97287.0, true_card 65658, qerror 1.4817234761948277, latency 0.005445241928100586
estimating query 751: predicted 152764.74682740882, true_card 18301, qerror 8.347344234053265, latency 0.003922939300537109
estimating query 752: predicted 517230.6183725218, true_card 4185, qerror 123.59154560872685, latency 0.024351119995117188
estimating query 753: predicted 6241.159836075987, true_card 7995, qerror 0.7806328750564087, latency 0.005568981170654297
estimating query 754: predicted 131455.27845441326, true_card 998, qerror 131.71871588618563, latency 0.025381803512573242
estimating query 755: predicted 571067.6486788463, true_card 430179, qerror 1.3275116839242416, latency 0.005517244338989258
estimating query 756: predicted 25835.56197594719, true_card 35623, qerror 0.7252494729794567, latency 0.006663084030151367
estimating query 75

estimating query 814: predicted 7645855.708206684, true_card 3059536, qerror 2.4990245933392137, latency 0.01982283592224121
estimating query 815: predicted 5720857.341294819, true_card 47513578, qerror 0.12040468392624143, latency 0.019452810287475586
estimating query 816: predicted 3005936.24017304, true_card 692609, qerror 4.340019029745556, latency 0.017625808715820312
estimating query 817: predicted 3005936.24017304, true_card 9130, qerror 329.2372661744841, latency 0.014946937561035156
estimating query 818: predicted 7645855.708206684, true_card 125401, qerror 60.97124989598715, latency 0.020776033401489258
estimating query 819: predicted 2313027.640105966, true_card 1994249, qerror 1.1598489657540088, latency 0.01888298988342285
estimating query 820: predicted 3888751.897343668, true_card 18218641, qerror 0.21344906556661764, latency 0.018784046173095703
estimating query 821: predicted 3005936.24017304, true_card 692609, qerror 4.340019029745556, latency 0.021319150924682617
est

estimating query 872: predicted 7595848.729567398, true_card 98084, qerror 77.44228140744055, latency 0.031059980392456055
estimating query 873: predicted 202150.2180833354, true_card 34509, qerror 5.857898463685862, latency 0.009773015975952148
estimating query 874: predicted 1846421.2992158486, true_card 338555, qerror 5.453829656084975, latency 0.03015303611755371
estimating query 875: predicted 1783186.9465833842, true_card 164566, qerror 10.835694776462843, latency 0.011246204376220703
estimating query 876: predicted 7000810.433935726, true_card 356034, qerror 19.663319890616417, latency 0.027891159057617188
estimating query 877: predicted 136998.90736224773, true_card 10145, qerror 13.504081553696178, latency 0.004869222640991211
estimating query 878: predicted 3927843.137193097, true_card 29571, qerror 132.82753837182025, latency 0.006222963333129883
estimating query 879: predicted 7595848.729567398, true_card 86857, qerror 87.45234960414703, latency 0.03115391731262207
estimati

estimating query 929: predicted 1952266.0813034298, true_card 469, qerror 4162.614245849531, latency 0.016218900680541992
estimating query 930: predicted 126889.0, true_card 174305, qerror 0.7279710851668053, latency 0.0005228519439697266
estimating query 931: predicted 175806.0, true_card 33326, qerror 5.275340574926484, latency 0.0015337467193603516
estimating query 932: predicted 2512418.0, true_card 704085, qerror 3.5683447311049092, latency 0.0027909278869628906
estimating query 933: predicted 676416.0, true_card 862828, qerror 0.7839523056739003, latency 0.021456003189086914
estimating query 934: predicted 9963550.999999998, true_card 15900001, qerror 0.6266383882617365, latency 0.0024547576904296875
estimating query 935: predicted 7191.0, true_card 11102, qerror 0.6477211313276887, latency 0.001043081283569336
estimating query 936: predicted 214695.99999999997, true_card 303187, qerror 0.7081306256534745, latency 0.002395153045654297
estimating query 937: predicted 245333.999999

estimating query 1001: predicted 10872868.0, true_card 32280163, qerror 0.33682816285655065, latency 0.011472225189208984
estimating query 1002: predicted 10839122.150722621, true_card 18999553, qerror 0.5704935348069832, latency 0.01429295539855957
estimating query 1003: predicted 45729974.65872352, true_card 3825442, qerror 11.954167559911644, latency 0.010017156600952148
estimating query 1004: predicted 3179897.961960524, true_card 11526851, qerror 0.27586874871207445, latency 0.009632110595703125
estimating query 1005: predicted 2736817.5012277425, true_card 42172701, qerror 0.06489547589630891, latency 0.011468172073364258
estimating query 1006: predicted 26822668.0, true_card 227354931, qerror 0.11797706731946798, latency 0.012717008590698242
estimating query 1007: predicted 1353471.6119813493, true_card 507603, qerror 2.6663979763345553, latency 0.009449005126953125
estimating query 1008: predicted 30158309.617819563, true_card 1219820, qerror 24.72357365662111, latency 0.010847

estimating query 1073: predicted 10855133.865634272, true_card 3814706, qerror 2.8456016966010673, latency 0.00947713851928711
estimating query 1074: predicted 7934415.012515765, true_card 97054, qerror 81.75258116631736, latency 0.0043070316314697266
estimating query 1075: predicted 10872868.0, true_card 15900001, qerror 0.6838281330925703, latency 0.009386062622070312
estimating query 1076: predicted 10845941.242065629, true_card 9387994, qerror 1.1552991237601589, latency 0.00933384895324707
estimating query 1077: predicted 127971.84282214443, true_card 11102, qerror 11.526917926692887, latency 0.003618001937866211
estimating query 1078: predicted 116102.20980773504, true_card 130148, qerror 0.8920783247359547, latency 0.0034492015838623047
estimating query 1079: predicted 300175.25303719135, true_card 33691, qerror 8.909656971808237, latency 0.004369258880615234
estimating query 1080: predicted 1856242.2432154836, true_card 130148, qerror 14.262549122656388, latency 0.0047500133514

estimating query 1138: predicted 890684.9093026142, true_card 4365157, qerror 0.20404418656708434, latency 0.010818004608154297
estimating query 1139: predicted 882979.5711799209, true_card 1582060, qerror 0.5581201542166042, latency 0.01034402847290039
estimating query 1140: predicted 124687.71034507253, true_card 78162, qerror 1.5952471833508934, latency 0.0037369728088378906
estimating query 1141: predicted 175581.0, true_card 32941, qerror 5.330166054461006, latency 0.002699136734008789
estimating query 1142: predicted 2469519.3449018775, true_card 664704, qerror 3.715216615067575, latency 0.004213094711303711
estimating query 1143: predicted 45800.07352729846, true_card 8289, qerror 5.525403972409031, latency 0.0034699440002441406
estimating query 1144: predicted 9864749.0, true_card 15887427, qerror 0.6209154572354605, latency 0.003036022186279297
estimating query 1145: predicted 7007.822337282739, true_card 10595, qerror 0.6614273088516035, latency 0.003167867660522461
estimatin

estimating query 1201: predicted 547455.735757364, true_card 409126, qerror 1.3381103517189423, latency 0.0059320926666259766
estimating query 1202: predicted 12602052.812876116, true_card 2013844, qerror 6.257710534120873, latency 0.004827976226806641
estimating query 1203: predicted 2511027.5077886195, true_card 5517172, qerror 0.4551294590396347, latency 0.008497953414916992
estimating query 1204: predicted 2500204.3798603592, true_card 479480, qerror 5.214408066781428, latency 0.010951995849609375
estimating query 1205: predicted 657761.8695930053, true_card 5588393, qerror 0.11770143395301749, latency 0.006783008575439453
estimating query 1206: predicted 608655.2985344451, true_card 297636, qerror 2.044965321851003, latency 0.009835004806518555
estimating query 1207: predicted 1919228.473142796, true_card 922156, qerror 2.081240563573621, latency 0.010205984115600586
estimating query 1208: predicted 1002188.0850808781, true_card 12139474, qerror 0.08255613752958968, latency 0.0087

estimating query 1273: predicted 339038.87640907976, true_card 86270, qerror 3.9299742252124696, latency 0.006226778030395508
estimating query 1274: predicted 8079075.759769566, true_card 224227, qerror 36.03078915460478, latency 0.006038188934326172
estimating query 1275: predicted 1864945.500309435, true_card 922331, qerror 2.021991563017436, latency 0.008456230163574219
estimating query 1276: predicted 1862458.747852288, true_card 51472, qerror 36.18392034217221, latency 0.009472131729125977
estimating query 1277: predicted 662863.8126852768, true_card 4728606, qerror 0.14018165452678377, latency 0.008075714111328125
estimating query 1278: predicted 610934.095067298, true_card 103316, qerror 5.913257337365926, latency 0.009480953216552734
estimating query 1279: predicted 454005.317225433, true_card 175748, qerror 2.583274445373108, latency 0.01135396957397461
estimating query 1280: predicted 310927.50047333236, true_card 1760476, qerror 0.1766155860536198, latency 0.0091071128845214

estimating query 1344: predicted 63037.3900950523, true_card 492, qerror 128.12477661595997, latency 0.007589817047119141
estimating query 1345: predicted 263112.61067663494, true_card 215690, qerror 1.219864670020098, latency 0.0037708282470703125
estimating query 1346: predicted 12945341.92048309, true_card 14143, qerror 915.317960862836, latency 0.006217002868652344
estimating query 1347: predicted 2512418.0, true_card 3318431, qerror 0.7571102126275941, latency 0.00783991813659668
estimating query 1348: predicted 2451543.2874073028, true_card 180320, qerror 13.595515125373241, latency 0.007391929626464844
estimating query 1349: predicted 676416.0, true_card 48222, qerror 14.027124548961055, latency 0.008414983749389648
estimating query 1350: predicted 627936.4044658589, true_card 1548, qerror 405.6436721355678, latency 0.00769805908203125
estimating query 1351: predicted 1664928.7165206908, true_card 517083, qerror 3.2198481027624015, latency 0.008439064025878906
estimating query 1

estimating query 1409: predicted 2633932.7729343735, true_card 1771584, qerror 1.4867670812867881, latency 0.010249137878417969
estimating query 1410: predicted 144535562.99999997, true_card 263105194, qerror 0.5493451527984657, latency 0.002717256546020508
estimating query 1411: predicted 61083.090120931156, true_card 69018, qerror 0.8850312979357726, latency 0.017176151275634766
estimating query 1412: predicted 103603.5990586732, true_card 163513, qerror 0.6336107774835835, latency 0.016520977020263672
estimating query 1413: predicted 3260930.7951281173, true_card 2323452, qerror 1.40348532921193, latency 0.017730712890625
estimating query 1414: predicted 7541133.548632218, true_card 10993960, qerror 0.6859342355831946, latency 0.024061918258666992
estimating query 1415: predicted 118712.24577861164, true_card 170480, qerror 0.6963411882837379, latency 0.0055332183837890625
estimating query 1416: predicted 24799.82122470161, true_card 34664, qerror 0.7154344918273023, latency 0.02392

estimating query 1467: predicted 12994152.0, true_card 4254157, qerror 3.054459908273249, latency 0.007840156555175781
estimating query 1468: predicted 2512418.0, true_card 704085, qerror 3.5683447311049092, latency 0.0025517940521240234
estimating query 1469: predicted 676416.0, true_card 864796, qerror 0.7821682801493068, latency 0.005385875701904297
estimating query 1470: predicted 1039890.0, true_card 1330735, qerror 0.7814403318466863, latency 0.007302761077880859
estimating query 1471: predicted 12994152.0, true_card 4254157, qerror 3.054459908273249, latency 0.007483005523681641
estimating query 1472: predicted 1879233.3560215388, true_card 520457, qerror 3.6107370177008646, latency 0.006124973297119141
estimating query 1473: predicted 492022.5756746676, true_card 456127, qerror 1.0786964500559442, latency 0.0046367645263671875
estimating query 1474: predicted 94849.0117683234, true_card 133397, qerror 0.7110280723578746, latency 0.003059864044189453
estimating query 1475: predi

estimating query 1528: predicted 436888.02676085907, true_card 414167, qerror 1.054859577805231, latency 0.01194310188293457
estimating query 1529: predicted 94519.61712534676, true_card 113882, qerror 0.8299785490713788, latency 0.011337995529174805
estimating query 1530: predicted 198316.68085842906, true_card 266893, qerror 0.7430568836890779, latency 0.013174057006835938
estimating query 1531: predicted 433756.77823817753, true_card 408599, qerror 1.0615708267474406, latency 0.01578998565673828
estimating query 1532: predicted 136572.65612622304, true_card 143872, qerror 0.9492650142225244, latency 0.004534721374511719
estimating query 1533: predicted 580011.24196387, true_card 443526, qerror 1.3077277137391494, latency 0.005853891372680664
estimating query 1534: predicted 27246.1500873598, true_card 41935, qerror 0.6497233835068511, latency 0.004734039306640625
estimating query 1535: predicted 54003.0, true_card 79593, qerror 0.6784893143869436, latency 0.0034542083740234375
estim

estimating query 1584: predicted 1754274.389280887, true_card 2087379, qerror 0.8404196790716429, latency 0.02549004554748535
estimating query 1585: predicted 1921188.2473623655, true_card 3203614, qerror 0.599694047835465, latency 0.0050890445709228516
estimating query 1586: predicted 17503.387989367373, true_card 24368, qerror 0.7182939916844785, latency 0.0065708160400390625
estimating query 1587: predicted 912146.756239411, true_card 991738, qerror 0.9197456951729297, latency 0.024050235748291016
estimating query 1588: predicted 24235.99161855235, true_card 33690, qerror 0.7193823573331063, latency 0.025784969329833984
estimating query 1589: predicted 54003.0, true_card 77535, qerror 0.6964983555813503, latency 0.005983829498291016
estimating query 1590: predicted 235669477.60858002, true_card 537352263, qerror 0.4385753886898212, latency 0.026602983474731445
estimating query 1591: predicted 1754274.389280887, true_card 2087379, qerror 0.8404196790716429, latency 0.0282700061798095

estimating query 1674: predicted 778306.7502120471, true_card 428079, qerror 1.8181381245331987, latency 0.00996088981628418
estimating query 1675: predicted 126406.93024793467, true_card 52257, qerror 2.418947322807177, latency 0.0050318241119384766
estimating query 1676: predicted 43078.461103358844, true_card 64699, qerror 0.6658288552119638, latency 0.006802082061767578
estimating query 1677: predicted 995851.0, true_card 1260416, qerror 0.790097079059612, latency 0.008730173110961914
estimating query 1678: predicted 18769332.0905111, true_card 1957551, qerror 9.588170162877544, latency 0.009892940521240234
estimating query 1679: predicted 3584751.7952229665, true_card 185957, qerror 19.277315697838567, latency 0.00561213493347168
estimating query 1680: predicted 987917.9615031688, true_card 267713, qerror 3.690212882837848, latency 0.0070149898529052734
estimating query 1681: predicted 12904936.0, true_card 4040813, qerror 3.1936484068923754, latency 0.00830698013305664
estimating

estimating query 1738: predicted 30869292093.38499, true_card 76679643179, qerror 0.4025748009980185, latency 0.005700826644897461
estimating query 1739: predicted 3707999.2994642113, true_card 2697536, qerror 1.37458751225719, latency 0.014836788177490234
estimating query 1740: predicted 1553309.9196851226, true_card 609782, qerror 2.5473200581275317, latency 0.015318870544433594
estimating query 1741: predicted 1113733.1034961296, true_card 1155518, qerror 0.9638388181717028, latency 0.013612747192382812
estimating query 1742: predicted 163282555.05070704, true_card 43927632, qerror 3.717080744318452, latency 0.01583123207092285
estimating query 1743: predicted 12931724.599557169, true_card 22310877, qerror 0.5796152522178831, latency 0.004295825958251953
estimating query 1744: predicted 819700.9999999998, true_card 1331833, qerror 0.6154683057110011, latency 0.002875089645385742
estimating query 1745: predicted 1952.0, true_card 1192, qerror 1.6375838926174497, latency 0.01323103904

estimating query 1803: predicted 28858250.147380326, true_card 47376444, qerror 0.6091265555384513, latency 0.004998922348022461
estimating query 1804: predicted 191087.0, true_card 279018, qerror 0.6848554573540059, latency 0.002355813980102539
estimating query 1805: predicted 9769283.0, true_card 16203910, qerror 0.6028966465501228, latency 0.0028753280639648438
estimating query 1806: predicted 35093.69379677992, true_card 47100, qerror 0.7450890402713358, latency 0.0028018951416015625
estimating query 1807: predicted 2256080.1971353185, true_card 3115494, qerror 0.7241484647812895, latency 0.0031769275665283203
estimating query 1808: predicted 54920.0, true_card 79851, qerror 0.6877809920977821, latency 0.0009610652923583984
estimating query 1809: predicted 28858250.147380326, true_card 47376444, qerror 0.6091265555384513, latency 0.005471229553222656
estimating query 1810: predicted 4904387350.297022, true_card 11206879551, qerror 0.43762291974124046, latency 0.0058591365814208984


estimating query 1878: predicted 3126045.5654877676, true_card 61110, qerror 51.154402969853834, latency 0.00837397575378418
estimating query 1879: predicted 784062.5152942177, true_card 63711, qerror 12.306548559812557, latency 0.008095026016235352
estimating query 1880: predicted 9996790.776762264, true_card 878864, qerror 11.37467318807263, latency 0.008419036865234375
estimating query 1881: predicted 15106066.206368793, true_card 640157, qerror 23.597439700524703, latency 0.007211923599243164
estimating query 1882: predicted 618296.3770465668, true_card 184562, qerror 3.3500741054310574, latency 0.008255958557128906
estimating query 1883: predicted 15106066.206368793, true_card 490921, qerror 30.770869867797046, latency 0.0097198486328125
estimating query 1884: predicted 118154.08582938583, true_card 25695, qerror 4.598329862984465, latency 0.004759073257446289
estimating query 1885: predicted 183652.6458626166, true_card 31580, qerror 5.815473269873864, latency 0.00377798080444335

estimating query 1934: predicted 824738.3574174775, true_card 226973, qerror 3.6336408181478745, latency 0.00857400894165039
estimating query 1935: predicted 20502414.790183533, true_card 914210, qerror 22.42637336080718, latency 0.010792970657348633
estimating query 1936: predicted 124772.99999999997, true_card 167242, qerror 0.7460625919326483, latency 0.002128124237060547
estimating query 1937: predicted 163285.14832577817, true_card 30379, qerror 5.374934932873964, latency 0.0030100345611572266
estimating query 1938: predicted 2470878.888086837, true_card 697276, qerror 3.5436167143094512, latency 0.005377054214477539
estimating query 1939: predicted 51737.980908222766, true_card 52606, qerror 0.9834996180706149, latency 0.009464025497436523
estimating query 1940: predicted 6676.000000000001, true_card 10074, qerror 0.6626960492356563, latency 0.001934051513671875
estimating query 1941: predicted 210157.0193321722, true_card 298469, qerror 0.7041167402047522, latency 0.003874778747

estimating query 2006: predicted 143918954.0891415, true_card 96484, qerror 1491.6354430697472, latency 0.014741182327270508
estimating query 2007: predicted 144535562.99999997, true_card 263105194, qerror 0.5493451527984657, latency 0.0026569366455078125
estimating query 2008: predicted 9673781.794896679, true_card 15568550, qerror 0.6213669092431009, latency 0.0035371780395507812
estimating query 2009: predicted 271743.24423537985, true_card 508105, qerror 0.5348171032274429, latency 0.023799657821655273
estimating query 2010: predicted 57579.27774915889, true_card 50973, qerror 1.1296034714291663, latency 0.005066871643066406
estimating query 2011: predicted 9321635.938528532, true_card 15812416, qerror 0.5895137048335012, latency 0.0028269290924072266
estimating query 2012: predicted 259468.52355397682, true_card 599646, qerror 0.43270283392864595, latency 0.023209810256958008
estimating query 2013: predicted 89819.16252492665, true_card 128316, qerror 0.6999841214262185, latency 0

estimating query 2061: predicted 6679800.999999999, true_card 10238947, qerror 0.6523914031394048, latency 0.002972841262817383
estimating query 2062: predicted 95303.0, true_card 132712, qerror 0.7181189342335282, latency 0.002864837646484375
estimating query 2063: predicted 41632.20976029057, true_card 61634, qerror 0.6754747340800624, latency 0.004900693893432617
estimating query 2064: predicted 14225.351969351177, true_card 8897, qerror 1.598893106592242, latency 0.004819154739379883
estimating query 2065: predicted 595.6664683548505, true_card 866, qerror 0.6878365685390884, latency 0.0049479007720947266
estimating query 2066: predicted 9820274.0, true_card 16322646, qerror 0.601634931003221, latency 0.0014579296112060547
estimating query 2067: predicted 194175.0, true_card 281859, qerror 0.6889082839291987, latency 0.0012097358703613281
estimating query 2068: predicted 54920.0, true_card 79851, qerror 0.6877809920977821, latency 0.0008249282836914062
estimating query 2069: predic

estimating query 2132: predicted 24969.0, true_card 34789, qerror 0.7177268676880623, latency 0.0053789615631103516
estimating query 2133: predicted 54003.0, true_card 79206, qerror 0.6818044087569124, latency 0.0059931278228759766
estimating query 2134: predicted 945130.9999999998, true_card 1050170, qerror 0.8999790510107885, latency 0.004030942916870117
estimating query 2135: predicted 537.0000000000002, true_card 593, qerror 0.9055649241146716, latency 0.00861215591430664
estimating query 2136: predicted 1494308.9732415613, true_card 70752, qerror 21.12037784432329, latency 0.008706808090209961
estimating query 2137: predicted 1304652.459521293, true_card 78137, qerror 16.696986824696275, latency 0.009203910827636719
estimating query 2138: predicted 2245524.0396785876, true_card 2712537, qerror 0.8278316718550153, latency 0.007207155227661133
estimating query 2139: predicted 2702698.970703454, true_card 3665459, qerror 0.7373425731138867, latency 0.007781028747558594
estimating que

estimating query 2201: predicted 2999080.727212769, true_card 124141, qerror 24.158664157794515, latency 0.0059549808502197266
estimating query 2202: predicted 2485885.204128066, true_card 466494, qerror 5.328868547351233, latency 0.02513575553894043
estimating query 2203: predicted 10658481.353657369, true_card 1631222, qerror 6.534047084736087, latency 0.011124849319458008
estimating query 2204: predicted 148128.3972200296, true_card 17707, qerror 8.3655276003857, latency 0.0066149234771728516
estimating query 2205: predicted 9996790.776762264, true_card 2753908, qerror 3.6300380320483705, latency 0.024671077728271484
estimating query 2206: predicted 10682520.327078065, true_card 43872001, qerror 0.2434928903078313, latency 0.011096000671386719
estimating query 2207: predicted 1867256.2476142053, true_card 520248, qerror 3.589165643335881, latency 0.006663084030151367
estimating query 2208: predicted 10652106.644127823, true_card 45707592, qerror 0.23304895703382983, latency 0.029705

estimating query 2276: predicted 3175126.5913756485, true_card 211971, qerror 14.979061245998974, latency 0.003802776336669922
estimating query 2277: predicted 2721842.1734249536, true_card 1122938, qerror 2.423857927530241, latency 0.0020449161529541016
estimating query 2278: predicted 10855133.865634272, true_card 3452526, qerror 3.1441135752878537, latency 0.00940704345703125
estimating query 2279: predicted 158779.56753747456, true_card 29591, qerror 5.3658060740588205, latency 0.005717754364013672
estimating query 2280: predicted 12994152.0, true_card 4255848, qerror 3.05324626255449, latency 0.0034050941467285156
estimating query 2281: predicted 10872868.0, true_card 70129559, qerror 0.15503973153460154, latency 0.009186983108520508
estimating query 2282: predicted 2398000.6588536594, true_card 691208, qerror 3.469289503092643, latency 0.006033182144165039
estimating query 2283: predicted 10846177.542969456, true_card 81958485, qerror 0.1323374577137371, latency 0.007737874984741

estimating query 2348: predicted 500531.23190874996, true_card 253217, qerror 1.9766888949349766, latency 0.02544093132019043
estimating query 2349: predicted 6560647.291489274, true_card 5543342, qerror 1.1835184066740378, latency 0.009793996810913086
estimating query 2350: predicted 92739.31349551963, true_card 63198, qerror 1.4674406388733763, latency 0.009550094604492188
estimating query 2351: predicted 3294989.6201880327, true_card 136431, qerror 24.151326459441275, latency 0.007097959518432617
estimating query 2352: predicted 2687213.2299221866, true_card 520276, qerror 5.1649763393317905, latency 0.024857044219970703
estimating query 2353: predicted 10845791.943054449, true_card 1827833, qerror 5.93368865922349, latency 0.010057926177978516
estimating query 2354: predicted 161388.56748889745, true_card 19469, qerror 8.289514997632002, latency 0.006885051727294922
estimating query 2355: predicted 9699342.551246593, true_card 2763952, qerror 3.5092297374363204, latency 0.026454925

estimating query 2416: predicted 76467.24852392465, true_card 55484, qerror 1.3781855764531152, latency 0.00783991813659668
estimating query 2417: predicted 125857.18003851757, true_card 65334, qerror 1.9263657519594326, latency 0.003526926040649414
estimating query 2418: predicted 171841.45284697707, true_card 164951, qerror 1.0417727255183484, latency 0.007646083831787109
estimating query 2419: predicted 1039428.3713686675, true_card 987389, qerror 1.0527040217874288, latency 0.008306264877319336
estimating query 2420: predicted 54003.0, true_card 75910, qerror 0.7114082466078251, latency 0.002641916275024414
estimating query 2421: predicted 163285.14832577817, true_card 16583, qerror 9.846538522931807, latency 0.004189968109130859
estimating query 2422: predicted 2426569.6371417586, true_card 185793, qerror 13.060608511309676, latency 0.0049059391021728516
estimating query 2423: predicted 651548.45048051, true_card 146285, qerror 4.453966233588611, latency 0.00887298583984375
estima

estimating query 2474: predicted 2736817.5012277425, true_card 67369732, qerror 0.04062384426922973, latency 0.013869047164916992
estimating query 2475: predicted 26818513.342318006, true_card 289290682, qerror 0.09270438009585807, latency 0.015101909637451172
estimating query 2476: predicted 46481443.154860005, true_card 155696936, qerror 0.29853794396352157, latency 0.016985177993774414
estimating query 2477: predicted 39926166.8889359, true_card 1297422, qerror 30.773462211166375, latency 0.014890909194946289
estimating query 2478: predicted 3179897.961960524, true_card 12492036, qerror 0.25455401841305325, latency 0.011044740676879883
estimating query 2479: predicted 2736817.5012277425, true_card 21582838, qerror 0.12680526542560078, latency 0.015145063400268555
estimating query 2480: predicted 24270672.40716143, true_card 96498890, qerror 0.25151245166821534, latency 0.01573491096496582
estimating query 2481: predicted 46481443.154860005, true_card 524182774, qerror 0.088674114183

estimating query 2540: predicted 239652.87947577654, true_card 175223, qerror 1.367702182223661, latency 0.010298967361450195
estimating query 2541: predicted 49734.142963431324, true_card 38044, qerror 1.3072795437764515, latency 0.01175379753112793
estimating query 2542: predicted 115787.27484966909, true_card 146996, qerror 0.7876899701329906, latency 0.014509201049804688
estimating query 2543: predicted 708231.2170188845, true_card 90846, qerror 7.795953779130446, latency 0.009762763977050781
estimating query 2544: predicted 3819083.198135077, true_card 51304, qerror 74.4402619315273, latency 0.006876707077026367
estimating query 2545: predicted 1024651.7817532377, true_card 251465, qerror 4.074729213819966, latency 0.009722232818603516
estimating query 2546: predicted 195948.3839197432, true_card 33366, qerror 5.872696275242559, latency 0.01128697395324707
estimating query 2547: predicted 6200272.427053219, true_card 668362, qerror 9.276817693186056, latency 0.010223865509033203
e

In [None]:
# Report the q-error distribution at a few percentiles, then the summed latency.
# `qerror` and `latency` are not defined in this cell; they must already exist in the kernel.
for i in (50, 90, 95, 99, 100):
    percentile_value = np.percentile(qerror, i)
    print(f"q-error {i}% percentile is {percentile_value}")
total_latency = np.sum(latency)
print(f"total inference time: {total_latency}")

In [15]:

# Prepare the post-split data for an incremental model update.
# `FJmodel`, `buckets`, `data_folder` and `after_data` are not defined in this cell;
# they must already exist in the kernel from earlier cells.
# NOTE(review): absolute, machine-specific path — adjust before running on another host.
model_path = "/Users/ziniuw/Desktop/research/Learned_QO/data/saved_models/"
# Start from the bucket metadata and null markers stored on the trained model.
table_buckets = FJmodel.table_buckets
null_values = FJmodel.null_value
# The call returns the data to update with, and rebinds table_buckets / null_values
# to the values it returns.
data, table_buckets, null_values = update_stats_data(data_folder, model_path, buckets, table_buckets,
                                                     null_values, False, after_data)

tags does not have data to update
updating equivalent key group: {'votes.PostId', 'postLinks.PostId', 'posts.Id', 'comments.PostId', 'tags.ExcerptPostId', 'postLinks.RelatedPostId', 'postHistory.PostId'}
tags.ExcerptPostId
updating equivalent key group: {'badges.UserId', 'users.Id', 'comments.UserId', 'postHistory.UserId', 'posts.OwnerUserId', 'votes.UserId'}


In [None]:
def test_trained_BN_on_stats(bn, t_name):
    queries = {
        "posts": "SELECT COUNT(*) FROM posts as p WHERE posts.CommentCount<=18 AND posts.CreationDate>='2010-07-23 07:27:31'::timestamp AND posts.CreationDate<='2014-09-09 01:43:00'::timestamp",
        "comments": "SELECT COUNT(*) FROM comments as c WHERE comments.CreationDate>='2010-08-05 00:36:02'::timestamp AND comments.CreationDate<='2014-09-08 16:50:49'::timestamp",
        "postHistory": "SELECT COUNT(*) FROM postHistory as ph WHERE postHistory.PostHistoryTypeId=1 AND postHistory.CreationDate>='2010-09-14 11:59:07'::timestamp",
        "votes": "SELECT COUNT(*) FROM votes as v WHERE votes.VoteTypeId=2 AND votes.CreationDate<='2014-09-10 00:00:00'::timestamp",
        "postLinks": "SELECT COUNT(*) FROM postLinks as pl WHERE postLinks.LinkTypeId=1 AND postLinks.CreationDate>='2011-09-03 21:00:10'::timestamp AND postLinks.CreationDate<='2014-07-30 21:29:52'::timestamp",
        "users": "SELECT COUNT(*) FROM users as u WHERE users.DownVotes>=0 AND users.DownVotes<=0 AND users.UpVotes>=0 AND users.UpVotes<=31 AND users.CreationDate<='2014-08-06 20:38:52'::timestamp",
        "badges": "SELECT COUNT(*) FROM badges as b WHERE badges.Date>='2010-09-26 12:17:14'::timestamp",
        "tags": "SELECT COUNT(*) FROM tags"
    }

    true_cards = {
        "posts": 90764,
        "comments": 172156,
        "postHistory": 42308,
        "votes": 261476,
        "postLinks": 8776,
        "users": 37062,
        "badges": 77704,
        "tags": 1032
    }

    bn.init_inference_method()
    bn.infer_algo = "exact-jit"
    query = parse_query_single_table(queries[t_name], bn)
    pred = bn.query(query)
    print(pred)
    assert min(pred, true_cards[t_name]) / max(pred, true_cards[t_name]) <= 1.5, f"Qerror too large, we have predition" \
                                                                        f"{pred} for true card {true_cards[t_name]}"

    query = parse_query_single_table(queries[t_name], bn)
    _, id_probs = bn.query_id_prob(query, bn.id_attributes)
    print(np.sum(id_probs))
    if t_name not in ['votes', 'tags']:
        assert min(pred, np.sum(id_probs)) / max(pred, np.sum(id_probs)) <= 1.5, "query_id_prob is incorrect"

In [None]:
# list.index raises ValueError when the value is absent. Catch and show the error
# so this demonstration does not abort a "Restart & Run All".
try:
    [1, 2, 3].index(4)
except ValueError as err:
    index_error = err
    print(f"ValueError: {err}")

In [None]:
# Incrementally update and validate the network of one table in isolation.
# `FJmodel`, `null_values` and `data` are not defined in this cell; they must already
# exist in the kernel.
t_name = "postHistory"
bn = FJmodel.bns[t_name]
# Hand the table's null marker(s) to the network before updating it.
bn.null_values = null_values[t_name]
# Size of the data about to be applied to the network.
print(len(data[t_name]))
bn.update_from_data(data[t_name])
# Raises AssertionError if the updated network fails the hard-coded check.
test_trained_BN_on_stats(bn, t_name)

In [16]:
# Walk over every table of the model's schema and incrementally update its
# Bayesian network with whatever `data` holds for that table.
for table in FJmodel.schema.tables:
    t_name = table.table_name
    print(t_name)
    if t_name not in data or data[t_name] is None:
        # Nothing to apply for this table.
        continue
    bn = FJmodel.bns[t_name]
    bn.null_values = null_values[t_name]
    bn.update_from_data(data[t_name])
    # Validation via test_trained_BN_on_stats(bn, t_name) is left disabled here.



badges
Discretizing table took 0.024923086166381836 secs.
old value indexing
(-1, 1) 120
-1 105
new_cpd.state_names:  1
new_index: 1
new_cpd.state_names:  105
new_index: 105
new_cpd.value:  (1, 105)
(-1, 1) 1
-1 105
old value indexing
-1 105
new_cpd.state_names:  105
new_index: 105
new_cpd.value:  (105,)
-1 105
done, incremental parameter updating took 0.09226179122924805 secs.
votes




Discretizing table took 0.23058009147644043 secs.
old value indexing
(-1, 1) 12
-1 12
new_cpd.state_names:  10
new_index: 10
new_cpd.state_names:  10
new_index: 10
new_cpd.value:  (10, 10)
(-1, 1) 10
-1 10
old value indexing
(-1, 1) 89
-1 46
new_cpd.state_names:  30
new_index: 30
new_cpd.state_names:  46
new_index: 46
new_cpd.value:  (30, 46)
(-1, 1) 30
-1 46
old value indexing
-1 46
new_cpd.state_names:  46
new_index: 46
new_cpd.value:  (46,)
-1 46
old value indexing
(-1, 1) 105
-1 89
new_cpd.state_names:  99
new_index: 99
new_cpd.state_names:  30
new_index: 30
new_cpd.value:  (99, 30)
(-1, 1) 99
-1 30
old value indexing
(-1, 1) 12
-1 105
new_cpd.state_names:  10
new_index: 10
new_cpd.state_names:  99
new_index: 99
new_cpd.value:  (10, 99)
(-1, 1) 10
-1 99
done, incremental parameter updating took 0.07032203674316406 secs.
postHistory
Discretizing table took 0.03910708427429199 secs.




old value indexing
(-1, 1) 120
-1 106
new_cpd.state_names:  4
new_index: 4
new_cpd.state_names:  103
new_index: 103
new_cpd.value:  (4, 103)
(-1, 1) 4
-1 103
old value indexing
-1 25
new_cpd.state_names:  25
new_index: 25
new_cpd.value:  (25,)
-1 25
old value indexing
(-1, 1) 46
-1 120
new_cpd.state_names:  45
new_index: 45
new_cpd.state_names:  4
new_index: 4
new_cpd.value:  (45, 4)
(-1, 1) 45
-1 4
old value indexing
(-1, 1) 106
-1 25
new_cpd.state_names:  103
new_index: 103
new_cpd.state_names:  25
new_index: 25
new_cpd.value:  (103, 25)
(-1, 1) 103
-1 25
done, incremental parameter updating took 0.17610597610473633 secs.
posts
Discretizing table took 0.12494802474975586 secs.




old value indexing
(-1, 1) 32
-1 53
new_cpd.state_names:  13
new_index: 13
new_cpd.state_names:  53
new_index: 53
new_cpd.value:  (13, 53)
(-1, 1) 13
-1 53
old value indexing
(-1, 1) 37
-1 46
new_cpd.state_names:  34
new_index: 34
new_cpd.state_names:  42
new_index: 42
new_cpd.value:  (34, 42)
(-1, 1) 34
-1 42
old value indexing
(-1, 1) 120
-1 106
new_cpd.state_names:  4
new_index: 4
new_cpd.state_names:  103
new_index: 103
new_cpd.value:  (4, 103)
(-1, 1) 4
-1 103
old value indexing
(-1, 1) 60
-1 46
new_cpd.state_names:  21
new_index: 21
new_cpd.state_names:  42
new_index: 42
new_cpd.value:  (21, 42)
(-1, 1) 21
-1 42
old value indexing
-1 46
new_cpd.state_names:  42
new_index: 42
new_cpd.value:  (42,)
-1 42
old value indexing
(-1, 1) 106
-1 53
new_cpd.state_names:  103
new_index: 103
new_cpd.state_names:  53
new_index: 53
new_cpd.value:  (103, 53)
(-1, 1) 103
-1 53
old value indexing
(-1, 1) 7
-1 32
new_cpd.state_names:  4
new_index: 4
new_cpd.state_names:  13
new_index: 13
new_cpd.va



Discretizing table took 0.32356691360473633 secs.
old value indexing
(-1, 1) 120
-1 84
new_cpd.state_names:  6
new_index: 6
new_cpd.state_names:  96
new_index: 96
new_cpd.value:  (6, 96)
(-1, 1) 6
-1 96
old value indexing
(-1, 1) 54
-1 81
new_cpd.state_names:  11
new_index: 11
new_cpd.state_names:  57
new_index: 57
new_cpd.value:  (11, 57)
(-1, 1) 11
-1 57
old value indexing
-1 106
new_cpd.state_names:  17
new_index: 17
new_cpd.value:  (17,)
-1 17
old value indexing
(-1, 1) 84
-1 106
new_cpd.state_names:  96
new_index: 96
new_cpd.state_names:  17
new_index: 17
new_cpd.value:  (96, 17)
(-1, 1) 96
-1 17
old value indexing
(-1, 1) 81
-1 84
new_cpd.state_names:  57
new_index: 57
new_cpd.state_names:  96
new_index: 96
new_cpd.value:  (57, 96)
(-1, 1) 57
-1 96
old value indexing
(-1, 1) 80
-1 84
new_cpd.state_names:  61
new_index: 61
new_cpd.state_names:  96
new_index: 96
new_cpd.value:  (61, 96)
(-1, 1) 61
-1 96
done, incremental parameter updating took 0.09523582458496094 secs.
comments
Di



old value indexing
(-1, 1) 120
-1 46
new_cpd.state_names:  39
new_index: 39
new_cpd.state_names:  46
new_index: 46
new_cpd.value:  (39, 46)
(-1, 1) 39
-1 46
old value indexing
-1 46
new_cpd.state_names:  46
new_index: 46
new_cpd.value:  (46,)
-1 46
old value indexing
(-1, 1) 33
-1 106
new_cpd.state_names:  23
new_index: 23
new_cpd.state_names:  104
new_index: 104
new_cpd.value:  (23, 104)
(-1, 1) 23
-1 104
old value indexing
(-1, 1) 106
-1 120
new_cpd.state_names:  104
new_index: 104
new_cpd.state_names:  39
new_index: 39
new_cpd.value:  (104, 39)
(-1, 1) 104
-1 39
done, incremental parameter updating took 0.171644926071167 secs.
postLinks
Discretizing table took 0.009223222732543945 secs.
old value indexing
-1 117
new_cpd.state_names:  1
new_index: 1
new_cpd.value:  (1,)
-1 1
old value indexing
(-1, 1) 2
-1 117
new_cpd.state_names:  2
new_index: 2
new_cpd.state_names:  1
new_index: 1
new_cpd.value:  (2, 1)
(-1, 1) 2
-1 1
old value indexing
(-1, 1) 46
-1 117
new_cpd.state_names:  42
ne

In [None]:
# Scratch check: list.remove(x) deletes, in place, only the first element equal to x.
a = [1,2,2,3,]
a.remove(1)
# Display the list after the removal.
a

In [None]:
# Scratch check: build a length-1 array of ones and cast it to an integer dtype.
np.ones(1).astype(int)

In [None]:
# Scratch check: drop the first element of an array.
# ndarray has no .delete() method; np.delete returns a new array without the given index.
a = np.arange(10)
a = np.delete(a, 0)
a

In [None]:
import numpy as np
import pickle
import time
import os
import sys
sys.path.append("/home/ubuntu/CE_scheme")
from Schemas.stats.schema import gen_stats_light_schema
from Evaluation.training import train_one_stats, test_trained_BN_on_stats
from Join_scheme.data_prepare import read_table_csv, update_stats_data
from BayesCard.Models.Bayescard_BN import Bayescard_BN


def timestamp_transorform(time_string, start_date="2010-07-19 00:00:00"):
    """
    Convert a "YYYY-MM-DD HH:MM:SS" string into whole seconds elapsed since start_date.

    Both strings are parsed with time.strptime and converted with time.mktime,
    which interprets them as local time.

    :param time_string: the timestamp to convert.
    :param start_date: the reference timestamp that maps to 0.
    :return: integer difference in seconds (negative if time_string precedes start_date).
    :raises ValueError: if either string does not match the expected format.
    """
    layout = "%Y-%m-%d %H:%M:%S"
    # Parse both strings up front (reference first), then convert.
    origin_struct, target_struct = [time.strptime(text, layout) for text in (start_date, time_string)]
    target_seconds = int(time.mktime(target_struct))
    origin_seconds = int(time.mktime(origin_struct))
    return target_seconds - origin_seconds


def get_data_by_date(data_path, time_date="2014-01-01 00:00:00"):
    """
    Split every table of the stats schema into rows before and after a cut-off date.

    For each table, the first column whose name contains "Date" is used as the
    split key: the split position is found with np.searchsorted, which relies on
    that column being sorted ascending. Tables without such a column go entirely
    into the "before" part.

    :param data_path: csv path template; "/{}.csv" is appended when it does not end with ".csv".
    :param time_date: cut-off timestamp, converted with timestamp_transorform.
    :return: (before_data, after_data), two dicts keyed by table name; a value is
        None when the corresponding part is empty.
    """
    cutoff = timestamp_transorform(time_date)
    path_template = data_path if data_path.endswith(".csv") else data_path + "/{}.csv"
    schema = gen_stats_light_schema(path_template)

    before_data, after_data = {}, {}
    for table_obj in schema.tables:
        name = table_obj.table_name
        rows = read_table_csv(table_obj)
        n_rows = len(rows)
        # Only the first "Date" column decides the split position.
        date_column = next((col for col in rows.columns if "Date" in col), None)
        if date_column is None:
            split_at = n_rows
        else:
            split_at = np.searchsorted(rows[date_column].values, cutoff)

        before_data[name] = rows[:split_at] if split_at > 0 else None
        after_data[name] = rows[split_at:] if split_at < n_rows else None
    return before_data, after_data


def update_one_stats(FJmodel, buckets, table_buckets, data_path, save_model_folder, save_bucket_bins=False,
                     update_BN=True, retrain_BN=False, old_data=None, validate=False):
    """
    Incrementally update the FactorJoin model and pickle it to disk.

    :param FJmodel: trained model; its ``table_buckets`` and ``bns`` are modified in place.
    :param buckets: bucket information forwarded to update_stats_data.
    :param table_buckets: per-table bucket metadata forwarded to update_stats_data.
    :param data_path: location of the new data, forwarded to update_stats_data.
    :param save_model_folder: folder prefix (must end with a path separator) used for
        update_stats_data and for the output file "update_model.pkl".
    :param save_bucket_bins: forwarded to update_stats_data.
    :param update_BN: when False, only the bucket metadata is refreshed.
    :param retrain_BN: when True, rebuild each network from old plus new data instead
        of applying an incremental update.
    :param old_data: dict of per-table DataFrames of the previously seen data;
        required when retrain_BN is True.
    :param validate: when retraining, run test_trained_BN_on_stats on each rebuilt network.
    :raises ValueError: if retraining is requested without old_data.
    """
    if update_BN and retrain_BN and old_data is None:
        # Fail before any data processing instead of crashing half-way through.
        raise ValueError("old_data must be provided when retrain_BN is True")

    # The other call sites in this notebook pass the model's null values as the fifth
    # argument, followed by the save flag; without it save_bucket_bins would land in
    # the null_values slot.
    data, table_buckets, null_values = update_stats_data(data_path, save_model_folder, buckets, table_buckets,
                                                         FJmodel.null_value, save_bucket_bins)
    FJmodel.table_buckets = table_buckets
    if update_BN:
        if retrain_BN:
            # Local import: the cell defining this function does not import pandas itself.
            import pandas as pd

        # updating the single table estimator
        for table in FJmodel.schema.tables:
            t_name = table.table_name
            if t_name not in data or data[t_name] is None:
                continue
            if retrain_BN:
                # retrain the BN based on the new and old data
                bn = Bayescard_BN(t_name, table_buckets[t_name].id_attributes, table_buckets[t_name].bin_sizes,
                                  null_values=null_values[t_name])
                # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
                frames = [frame for frame in (old_data.get(t_name), data[t_name]) if frame is not None]
                bn.build_from_data(pd.concat(frames, ignore_index=True))
                if validate:
                    test_trained_BN_on_stats(bn, t_name)
                FJmodel.bns[t_name] = bn
            else:
                # incrementally update BN with this table's data only (not the whole dict)
                bn = FJmodel.bns[t_name]
                bn.null_values = null_values[t_name]
                bn.update_from_data(data[t_name])

    model_path = save_model_folder + "update_model.pkl"
    # Use a context manager so the file is flushed and closed deterministically.
    with open(model_path, 'wb') as f:
        pickle.dump(FJmodel, f, pickle.HIGHEST_PROTOCOL)
    print(f"models save at {model_path}")


def eval_update(data_folder, model_path, n_dim_dist, bin_size, bucket_method, split_date="2014-01-01 00:00:00"):
    """
    Train on the data before split_date, then incrementally update with the data after it.

    The trained model and buckets are read back from the model folder, every
    per-table network that has new data is updated in place, and the result is
    pickled as "updated_model_stats_{bucket_method}_{bin_size}.pkl" in that folder.

    :param data_folder: data location passed to get_data_by_date, train_one_stats
        and update_stats_data.
    :param model_path: folder for the saved models; a trailing path separator is
        added when missing.
    :param n_dim_dist: forwarded to train_one_stats.
    :param bin_size: forwarded to train_one_stats; also part of the model file names.
    :param bucket_method: forwarded to train_one_stats; also part of the model file names.
    :param split_date: "YYYY-MM-DD HH:MM:SS" timestamp separating training from update data.
    """
    # File names below are built by appending to model_path, so make sure it ends
    # with a separator (otherwise "saved_models" + "buckets.pkl" -> "saved_modelsbuckets.pkl").
    model_path = os.path.join(model_path, "")

    before_data, after_data = get_data_by_date(data_folder, split_date)
    print("************************************************************")
    print(f"Training the model with data before {split_date}")
    start_time = time.time()
    train_one_stats("stats", data_folder, model_path, n_dim_dist, bin_size, bucket_method, True, actual_data=before_data)
    print(f"training completed, took {time.time() - start_time} sec")

    # loading the trained model and buckets (files read from the model folder used for training)
    with open(model_path + "buckets.pkl", "rb") as f:
        buckets = pickle.load(f)
    with open(model_path + f"model_stats_{bucket_method}_{bin_size}.pkl", "rb") as f:
        FJmodel = pickle.load(f)
    print("************************************************************")
    print(f"Updating the model with data after {split_date}")
    start_time = time.time()
    table_buckets = FJmodel.table_buckets
    null_values = FJmodel.null_value
    data, table_buckets, null_values = update_stats_data(data_folder, model_path, buckets, table_buckets,
                                                         null_values, False, after_data)
    for table in FJmodel.schema.tables:
        t_name = table.table_name
        if t_name in data and data[t_name] is not None:
            bn = FJmodel.bns[t_name]
            bn.null_values = null_values[t_name]
            bn.update_from_data(data[t_name])
            #test_trained_BN_on_stats(bn, t_name)
    print(f"updating completed, took {time.time() - start_time} sec")

    updated_model_path = model_path + f"updated_model_stats_{bucket_method}_{bin_size}.pkl"
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(updated_model_path, 'wb') as f:
        pickle.dump(FJmodel, f, pickle.HIGHEST_PROTOCOL)
    print(f"updated models save at {updated_model_path}")

In [None]:
# Run the train-then-update evaluation on the stats dataset.
# NOTE(review): absolute, machine-specific paths — adjust before running on another host.
data_path = "/Users/ziniuw/Desktop/research/Learned_QO/data/stats_simplified/{}.csv"
# eval_update builds file names by appending to model_path (e.g. model_path + "buckets.pkl"),
# so the folder must end with a path separator.
model_path = "/Users/ziniuw/Desktop/research/Learned_QO/data/saved_models/"
eval_update(data_path, model_path, 2, 200, "sub_optimal")