In [1]:
import numpy as np
import pandas as pd
import pickle
from scipy import stats
import time
import sys
sys.path.append("/home/ubuntu/CE_scheme")
from Schemas.stats.schema import gen_stats_light_schema
from Evaluation.training import train_one_stats, test_trained_BN_on_stats
from Join_scheme.data_prepare import process_stats_data, update_stats_data
from BayesCard.Models.Bayescard_BN import Bayescard_BN
from BayesCard.Evaluation.cardinality_estimation import parse_query_single_table

In [2]:
# Folder holding the per-table CSV files of the dataset. get_data_by_date appends
# "/{}.csv" to it, and the same folder is handed to the training/update helpers below.
data_folder = '/home/ubuntu/End-to-End-CardEst-Benchmark/datasets/stats_simplified'

In [3]:
def read_table_csv(table_obj, csv_seperator=',', stats=True):
    """
    Read one table's CSV into a DataFrame with table-qualified column names.

    Parameters
    ----------
    table_obj : object
        Must expose ``csv_file_location`` (anything ``pd.read_csv`` accepts),
        ``table_name``, ``attributes`` (one name per CSV column, in order) and
        ``irrelevant_attributes`` (names to drop).
    csv_seperator : str
        Field separator; only used when ``stats`` is False.
    stats : bool
        When True the file is read with pandas defaults (first row is the header).
        When False the file is read as header-less, backslash-escaped,
        double-quoted UTF-8 text split on ``csv_seperator``.

    Returns
    -------
    pandas.DataFrame
        Columns are renamed to ``"<table_name>.<attribute>"``, the irrelevant
        attributes are removed, and every column that can be parsed as numeric
        is converted; columns that cannot be parsed are returned unchanged.
    """
    if stats:
        df_rows = pd.read_csv(table_obj.csv_file_location)
    else:
        df_rows = pd.read_csv(table_obj.csv_file_location, header=None, escapechar='\\', encoding='utf-8',
                              quotechar='"',
                              sep=csv_seperator)
    prefix = table_obj.table_name + '.'
    df_rows.columns = [prefix + attr for attr in table_obj.attributes]

    # Drop all irrelevant columns in one call instead of rebuilding the frame per column.
    irrelevant_columns = [prefix + attribute for attribute in table_obj.irrelevant_attributes]
    if irrelevant_columns:
        df_rows = df_rows.drop(columns=irrelevant_columns)

    def _to_numeric_or_keep(column):
        # Explicit equivalent of the deprecated pd.to_numeric(errors="ignore"):
        # a column that cannot be parsed is handed back untouched.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    return df_rows.apply(_to_numeric_or_keep)


def timestamp_transorform(time_string, start_date="2010-07-19 00:00:00"):
    """
    Return the number of whole seconds from ``start_date`` to ``time_string``.

    Both arguments must follow the ``%Y-%m-%d %H:%M:%S`` layout. Each one is parsed
    with ``time.strptime`` and converted with ``time.mktime``, so both are
    interpreted in the machine's local time zone.
    """
    layout = "%Y-%m-%d %H:%M:%S"
    # Parse the origin first, then the target, before any conversion takes place.
    origin_struct, target_struct = (time.strptime(stamp, layout) for stamp in (start_date, time_string))
    target_seconds = int(time.mktime(target_struct))
    origin_seconds = int(time.mktime(origin_struct))
    return target_seconds - origin_seconds


def get_data_by_date(data_path, time_date="2014-01-01 00:00:00"):
    """
    Split every table of the schema into rows dated before ``time_date`` and the rest.

    Parameters
    ----------
    data_path : str
        Either a path pattern ending in ``.csv`` or a folder; for a folder the
        pattern ``"/{}.csv"`` is appended before building the schema.
    time_date : str
        Split point in ``%Y-%m-%d %H:%M:%S`` format; it is converted to an integer
        offset with ``timestamp_transorform`` and compared against the date column.

    Returns
    -------
    (dict, dict)
        ``before_data`` and ``after_data``, both keyed by table name. A value is the
        matching rows as a DataFrame, or ``None`` when that side of the split is
        empty. For each table the first column whose name contains ``"Date"`` is used;
        a table without such a column goes entirely into ``before_data``.
    """
    time_value = timestamp_transorform(time_date)
    if not data_path.endswith(".csv"):
        data_path += "/{}.csv"
    schema = gen_stats_light_schema(data_path)
    before_data = dict()
    after_data = dict()
    for table_obj in schema.tables:
        table_name = table_obj.table_name
        df_rows = read_table_csv(table_obj)

        date_column = next((attribute for attribute in df_rows.columns if "Date" in attribute), None)
        if date_column is None:
            # No date information: treat every row as pre-existing data.
            is_before = np.ones(len(df_rows), dtype=bool)
        else:
            # A boolean mask (rather than a positional cut found by binary search) keeps
            # the split correct even when the rows are not ordered by date.
            is_before = (df_rows[date_column] < time_value).to_numpy()

        before_data[table_name] = df_rows[is_before] if is_before.any() else None
        after_data[table_name] = df_rows[~is_before] if not is_before.all() else None
    return before_data, after_data


In [4]:
# Split every table at the default cut-off of get_data_by_date ("2014-01-01 00:00:00"):
# before_data is used for training, after_data for the incremental update further down.
before_data, after_data = get_data_by_date(data_folder)

In [None]:
# Run the data-preparation step on the pre-split rows only (data=before_data).
# NOTE(review): 200 and "sub_optimal" are presumably the bin size and bucketing method,
# matching the train_one_stats call elsewhere in this notebook -- confirm against
# process_stats_data's signature.
model_path = "/home/ubuntu/data_CE/CE_scheme_models/update/"
data, null_values, key_attrs, table_buckets, equivalent_keys, schema, bin_size = process_stats_data(data_folder,
                                        model_path, 200, "sub_optimal", False, data=before_data)


In [None]:
# Build a single-table Bayesian network for one table from the prepared data, using
# that table's key attributes, bin sizes and null values from the preparation cell.
t_name = "postHistory"
bn = Bayescard_BN(t_name, key_attrs[t_name], bin_size[t_name], null_values=null_values[t_name])
bn.build_from_data(data[t_name])

In [5]:
# Train the full model on the pre-split rows (actual_data=before_data) with bin size 200
# and the "sub_optimal" bucketing method. The later cells load "buckets.pkl" and
# "model_stats_sub_optimal_200.pkl" from model_path, presumably written by this call.
model_path = "/home/ubuntu/data_CE/CE_scheme_models/update/"
train_one_stats("stats", data_folder, model_path, 200, "sub_optimal", True, actual_data=before_data)

bucketizing equivalent key group: {'postHistory.PostId', 'postLinks.RelatedPostId', 'posts.Id', 'postLinks.PostId', 'tags.ExcerptPostId', 'comments.PostId', 'votes.PostId'}
bucketizing equivalent key group: {'votes.UserId', 'badges.UserId', 'posts.OwnerUserId', 'postHistory.UserId', 'users.Id', 'comments.UserId'}
badges
Discretizing table takes 0.020223140716552734 secs
Structure learning took 1.2931559085845947 secs.
done, parameter learning took 0.07734131813049316 secs.
votes
Discretizing table takes 2.4802844524383545 secs
Structure learning took 11.316552639007568 secs.
done, parameter learning took 0.10006475448608398 secs.
postHistory
Discretizing table takes 0.09330105781555176 secs
Structure learning took 12.711439609527588 secs.
done, parameter learning took 0.21827101707458496 secs.
posts
Discretizing table takes 0.4853363037109375 secs
Structure learning took 11.590935230255127 secs.
done, parameter learning took 0.2164306640625 secs.
users
Discretizing table takes 2.479881

In [6]:
# Reload the bucket definitions and the trained model from model_path.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open(model_path + "buckets.pkl", "rb") as f:
    buckets = pickle.load(f)
with open(model_path + "model_stats_sub_optimal_200.pkl", "rb") as f:
    FJmodel = pickle.load(f)

In [None]:
# Read the workload, one query per line. The evaluation loop below splits each line on
# "||", taking the first field as the SQL text and the last field as the true cardinality.
query_file = "/home/ubuntu/End-to-End-CardEst-Benchmark/workloads/stats_CEB/sub_plan_queries/stats_CEB_sub_queries.sql"
with open(query_file, "r") as f:
    queries = f.readlines()


In [None]:
# Run every workload query through the model and record, per query, the estimate
# (pred), the ratio estimate / true cardinality (qerror) and the inference latency.
# Note that `qerror` holds the plain ratio res / true_card, not the symmetric
# max(ratio, 1 / ratio) form.
qerror = []
latency = []
pred = []
for i, query_str in enumerate(queries):
    if not query_str.strip():
        # Blank lines (e.g. a trailing newline at the end of the file) carry no query.
        continue
    fields = query_str.split("||")
    # The last character of the SQL field is dropped -- presumably a trailing ';'
    # (TODO confirm against the workload file format).
    query = fields[0][:-1]
    print("========================")
    true_card = int(fields[-1])
    t = time.time()
    res = FJmodel.get_cardinality_bound(query)
    # Measure once, right after the call, so the recorded and printed latency agree
    # and neither includes bookkeeping time.
    elapsed = time.time() - t
    pred.append(res)
    latency.append(elapsed)
    ratio = res / true_card
    qerror.append(ratio)
    print(f"estimating query {i}: predicted {res}, true_card {true_card}, qerror {ratio}, latency {elapsed}")

In [None]:
# Summarise the evaluation: selected percentiles of the recorded estimate/true ratios
# (the 100th percentile is the maximum) and the summed inference latency.
for i in [50, 90, 95, 99, 100]:
    print(f"q-error {i}% percentile is {np.percentile(qerror, i)}")
print(f"total inference time: {np.sum(latency)}")

In [7]:
# Prepare the post-split rows (after_data) for the incremental update, starting from the
# bucket definitions, per-table buckets and null values of the trained model.
# The call returns the prepared data together with new table_buckets / null_values,
# which rebind the names used by the update cells below.
table_buckets = FJmodel.table_buckets
null_values = FJmodel.null_value
data, table_buckets, null_values = update_stats_data(data_folder, model_path, buckets, table_buckets,
                                                     null_values, False, after_data)

tags does not have data to update
updating equivalent key group: {'postHistory.PostId', 'postLinks.RelatedPostId', 'posts.Id', 'postLinks.PostId', 'tags.ExcerptPostId', 'comments.PostId', 'votes.PostId'}
tags.ExcerptPostId
updating equivalent key group: {'votes.UserId', 'badges.UserId', 'posts.OwnerUserId', 'postHistory.UserId', 'users.Id', 'comments.UserId'}


In [None]:
def test_trained_BN_on_stats(bn, t_name):
    """
    Sanity-check a single-table BN against one hard-coded query per table.

    This notebook-level definition shadows the function of the same name imported
    from ``Evaluation.training`` at the top of the notebook.

    Parameters
    ----------
    bn : the single-table estimator for ``t_name``; it must provide
        ``init_inference_method``, ``query``, ``query_id_prob`` and ``id_attributes``.
    t_name : str
        Table name; must be a key of the ``queries`` / ``true_cards`` tables below.

    Raises
    ------
    AssertionError
        If the q-error (larger value divided by smaller value) between the
        prediction and the reference cardinality exceeds 1.5, or, for every table
        except ``votes`` and ``tags``, if the summed id probabilities disagree
        with the prediction by more than that factor.
    """
    queries = {
        "posts": "SELECT COUNT(*) FROM posts as p WHERE posts.CommentCount<=18 AND posts.CreationDate>='2010-07-23 07:27:31'::timestamp AND posts.CreationDate<='2014-09-09 01:43:00'::timestamp",
        "comments": "SELECT COUNT(*) FROM comments as c WHERE comments.CreationDate>='2010-08-05 00:36:02'::timestamp AND comments.CreationDate<='2014-09-08 16:50:49'::timestamp",
        "postHistory": "SELECT COUNT(*) FROM postHistory as ph WHERE postHistory.PostHistoryTypeId=1 AND postHistory.CreationDate>='2010-09-14 11:59:07'::timestamp",
        "votes": "SELECT COUNT(*) FROM votes as v WHERE votes.VoteTypeId=2 AND votes.CreationDate<='2014-09-10 00:00:00'::timestamp",
        "postLinks": "SELECT COUNT(*) FROM postLinks as pl WHERE postLinks.LinkTypeId=1 AND postLinks.CreationDate>='2011-09-03 21:00:10'::timestamp AND postLinks.CreationDate<='2014-07-30 21:29:52'::timestamp",
        "users": "SELECT COUNT(*) FROM users as u WHERE users.DownVotes>=0 AND users.DownVotes<=0 AND users.UpVotes>=0 AND users.UpVotes<=31 AND users.CreationDate<='2014-08-06 20:38:52'::timestamp",
        "badges": "SELECT COUNT(*) FROM badges as b WHERE badges.Date>='2010-09-26 12:17:14'::timestamp",
        "tags": "SELECT COUNT(*) FROM tags"
    }

    true_cards = {
        "posts": 90764,
        "comments": 172156,
        "postHistory": 42308,
        "votes": 261476,
        "postLinks": 8776,
        "users": 37062,
        "badges": 77704,
        "tags": 1032
    }

    def _q_error(estimate, reference):
        # Symmetric error factor: larger / smaller, always >= 1.
        # (smaller / larger is always <= 1 and would make any threshold >= 1 vacuous.)
        smaller = min(estimate, reference)
        larger = max(estimate, reference)
        if larger == smaller:
            return 1.0
        if smaller <= 0:
            return float("inf")
        return larger / smaller

    bn.init_inference_method()
    bn.infer_algo = "exact-jit"
    query = parse_query_single_table(queries[t_name], bn)
    pred = bn.query(query)
    print(pred)
    assert _q_error(pred, true_cards[t_name]) <= 1.5, \
        f"Qerror too large, we have prediction {pred} for true card {true_cards[t_name]}"

    # Re-parse the query for the second inference call, as the original flow did.
    query = parse_query_single_table(queries[t_name], bn)
    _, id_probs = bn.query_id_prob(query, bn.id_attributes)
    id_total = np.sum(id_probs)
    print(id_total)
    # The consistency check between the two inference paths is skipped for these tables.
    if t_name not in ['votes', 'tags']:
        assert _q_error(pred, id_total) <= 1.5, "query_id_prob is incorrect"

In [None]:
# Incrementally update one table's BN with the rows currently held in `data`
# (the output of update_stats_data when the cells are run in order), then validate it.
t_name = "postHistory"
bn = FJmodel.bns[t_name]
# The BN must see the refreshed null values before it consumes the new rows.
bn.null_values = null_values[t_name]
print(len(data[t_name]))
bn.update_from_data(data[t_name])
test_trained_BN_on_stats(bn, t_name)

In [None]:
# Display the notebook-level `pred` (the list filled by the evaluation loop); the `pred`
# inside test_trained_BN_on_stats is local to that function and is not visible here.
pred

In [8]:
# Incrementally update the BN of every table that has new rows; tables whose entry in
# `data` is missing or None are only printed and otherwise skipped.
for table in FJmodel.schema.tables:
    t_name = table.table_name
    print(t_name)
    if t_name in data and data[t_name] is not None:
        bn = FJmodel.bns[t_name]
        bn.null_values = null_values[t_name]
        bn.update_from_data(data[t_name])
        #test_trained_BN_on_stats(bn, t_name)

badges
Discretizing table took 0.03995180130004883 secs.




done, incremental parameter updating took 0.17960119247436523 secs.
votes




Discretizing table took 0.6610567569732666 secs.
done, incremental parameter updating took 0.1657705307006836 secs.
postHistory
Discretizing table took 0.0810699462890625 secs.




done, incremental parameter updating took 0.30565738677978516 secs.
posts
Discretizing table took 0.41088366508483887 secs.




done, incremental parameter updating took 0.4997742176055908 secs.
users




Discretizing table took 0.9261248111724854 secs.
done, incremental parameter updating took 0.19570565223693848 secs.
comments




Discretizing table took 0.06556344032287598 secs.




done, incremental parameter updating took 0.2619166374206543 secs.
postLinks
Discretizing table took 0.024054527282714844 secs.
done, incremental parameter updating took 0.1285252571105957 secs.
tags


In [None]:
# Scratch cell: list.remove(x) deletes the first occurrence of x in place.
a = [1,2,2,3,]
a.remove(1)
a

In [None]:
# Scratch cell: a length-1 array of ones cast to an integer dtype.
np.ones(1).astype(int)

In [None]:
# Scratch cell: drop the first element of a NumPy array.
a = np.arange(10)
# ndarray has no .delete method; np.delete returns a new array without the given index.
a = np.delete(a, 0)
a

In [9]:
import numpy as np
import pickle
import time
import os
import sys
sys.path.append("/home/ubuntu/CE_scheme")
from Schemas.stats.schema import gen_stats_light_schema
from Evaluation.training import train_one_stats, test_trained_BN_on_stats
from Join_scheme.data_prepare import read_table_csv, update_stats_data
from BayesCard.Models.Bayescard_BN import Bayescard_BN


def timestamp_transorform(time_string, start_date="2010-07-19 00:00:00"):
    """
    Convert ``time_string`` into an integer second offset relative to ``start_date``.

    Both values are ``%Y-%m-%d %H:%M:%S`` strings. They are parsed with
    ``time.strptime`` and turned into epoch seconds with ``time.mktime`` (local time
    zone); the result is target seconds minus start seconds.
    """
    pattern = "%Y-%m-%d %H:%M:%S"
    parsed_start = time.strptime(start_date, pattern)
    parsed_target = time.strptime(time_string, pattern)

    def _epoch_seconds(parsed):
        # Truncate mktime's float result to whole seconds.
        return int(time.mktime(parsed))

    return _epoch_seconds(parsed_target) - _epoch_seconds(parsed_start)


def get_data_by_date(data_path, time_date="2014-01-01 00:00:00"):
    """
    Partition each table of the schema around ``time_date``.

    Parameters
    ----------
    data_path : str
        A path pattern ending in ``.csv``, or a folder to which ``"/{}.csv"`` is
        appended before the schema is generated.
    time_date : str
        Cut-off in ``%Y-%m-%d %H:%M:%S`` format, converted to an integer offset by
        ``timestamp_transorform``.

    Returns
    -------
    (dict, dict)
        ``(before_data, after_data)`` keyed by table name. Rows whose date value is
        strictly below the cut-off go to ``before_data``, the others to
        ``after_data``; an empty side is stored as ``None``. The date value is taken
        from the first column whose name contains ``"Date"``; tables with no such
        column are placed wholly in ``before_data``.
    """
    time_value = timestamp_transorform(time_date)
    if not data_path.endswith(".csv"):
        data_path += "/{}.csv"
    schema = gen_stats_light_schema(data_path)
    before_data = dict()
    after_data = dict()
    for table_obj in schema.tables:
        table_name = table_obj.table_name
        df_rows = read_table_csv(table_obj)

        date_columns = [attribute for attribute in df_rows.columns if "Date" in attribute]
        if date_columns:
            # Compare row by row instead of binary-searching for a cut position, so the
            # result does not rely on the table being sorted by its date column.
            older = (df_rows[date_columns[0]] < time_value).to_numpy()
        else:
            # Without a date column every row counts as existing (pre-split) data.
            older = np.ones(len(df_rows), dtype=bool)

        before_data[table_name] = df_rows[older] if older.any() else None
        after_data[table_name] = df_rows[~older] if not older.all() else None
    return before_data, after_data


def update_one_stats(FJmodel, buckets, table_buckets, data_path, save_model_folder, save_bucket_bins=False,
                     update_BN=True, retrain_BN=False, old_data=None, validate=False):
    """
    Incrementally update the FactorJoin model and pickle the result.

    Parameters
    ----------
    FJmodel : the trained model; its ``table_buckets`` and ``bns`` are updated in place
        and its ``null_value`` and ``schema.tables`` are read.
    buckets, table_buckets : bucket structures forwarded to ``update_stats_data``.
    data_path : str
        Location of the new data, forwarded to ``update_stats_data``.
    save_model_folder : str
        Folder prefix; the model is written to ``save_model_folder + "update_model.pkl"``.
    save_bucket_bins : bool
        Forwarded to ``update_stats_data``.
    update_BN : bool
        When False only the table buckets are refreshed; the single-table BNs are untouched.
    retrain_BN : bool
        When True each affected BN is rebuilt from ``old_data`` plus the new rows;
        when False the existing BN is updated with the new rows only.
    old_data : dict or None
        Previous rows per table name; required when ``retrain_BN`` is True. A table
        whose entry is ``None`` is retrained on its new rows alone.
    validate : bool
        When True each retrained BN is checked with ``test_trained_BN_on_stats``.

    Raises
    ------
    ValueError
        If retraining is requested without ``old_data``.
    """
    if update_BN and retrain_BN and old_data is None:
        raise ValueError("retrain_BN=True requires old_data (the rows the model was trained on)")

    # Argument order follows the other update_stats_data call sites in this notebook:
    # the model's null values come before the save flag.
    # NOTE(review): those call sites also pass the new rows as a 7th argument; here the
    # helper is presumably expected to read them from data_path -- confirm.
    data, table_buckets, null_values = update_stats_data(data_path, save_model_folder, buckets, table_buckets,
                                                         FJmodel.null_value, save_bucket_bins)
    FJmodel.table_buckets = table_buckets
    if update_BN:
        # updating the single table estimator
        if retrain_BN:
            # Local import keeps this function self-contained.
            import pandas as pd

            # retrain the BN based on the new and old data
            for table in FJmodel.schema.tables:
                t_name = table.table_name
                if t_name in data and data[t_name] is not None:
                    bn = Bayescard_BN(t_name, table_buckets[t_name].id_attributes, table_buckets[t_name].bin_sizes,
                                      null_values=null_values[t_name])
                    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
                    frames = [frame for frame in (old_data[t_name], data[t_name]) if frame is not None]
                    new_data = pd.concat(frames, ignore_index=True)
                    bn.build_from_data(new_data)
                    if validate:
                        test_trained_BN_on_stats(bn, t_name)
                    FJmodel.bns[t_name] = bn
        else:
            # incrementally update BN
            for table in FJmodel.schema.tables:
                t_name = table.table_name
                if t_name in data and data[t_name] is not None:
                    bn = FJmodel.bns[t_name]
                    bn.null_values = null_values[t_name]
                    # Feed only this table's rows, not the whole per-table dict.
                    bn.update_from_data(data[t_name])

    model_path = save_model_folder + "update_model.pkl"
    # Context manager guarantees the file is flushed and closed even if pickling fails.
    with open(model_path, 'wb') as f:
        pickle.dump(FJmodel, f, pickle.HIGHEST_PROTOCOL)
    print(f"models save at {model_path}")


def eval_update(data_folder, model_path, bin_size, bucket_method, split_date="2014-01-01 00:00:00"):
    """
    Train on the rows before ``split_date``, incrementally update with the rest, and save.

    Parameters
    ----------
    data_folder : str
        Dataset location, passed to ``get_data_by_date``, ``train_one_stats`` and
        ``update_stats_data``.
    model_path : str
        Folder prefix (string-concatenated with file names, so it should end with a
        path separator). ``buckets.pkl`` and ``model_stats_<bucket_method>_<bin_size>.pkl``
        are read from it after training, and
        ``updated_model_stats_<bucket_method>_<bin_size>.pkl`` is written to it.
    bin_size, bucket_method :
        Forwarded to ``train_one_stats`` and used in the model file names.
    split_date : str
        Cut-off in ``%Y-%m-%d %H:%M:%S`` format.

    Prints the wall-clock time of the training and of the update phase.
    """
    before_data, after_data = get_data_by_date(data_folder, split_date)
    print("************************************************************")
    print(f"Training the model with data before {split_date}")
    start_time = time.time()
    train_one_stats("stats", data_folder, model_path, bin_size, bucket_method, True, actual_data=before_data)
    print(f"training completed, took {time.time() - start_time} sec")

    # loading the trained model and buckets
    # NOTE: pickle.load can execute arbitrary code; these files are the ones just written
    # by training, do not point model_path at untrusted content.
    with open(model_path + "buckets.pkl", "rb") as f:
        buckets = pickle.load(f)
    with open(model_path + f"model_stats_{bucket_method}_{bin_size}.pkl", "rb") as f:
        FJmodel = pickle.load(f)
    print("************************************************************")
    print(f"Updating the model with data after {split_date}")
    start_time = time.time()
    table_buckets = FJmodel.table_buckets
    null_values = FJmodel.null_value
    data, table_buckets, null_values = update_stats_data(data_folder, model_path, buckets, table_buckets,
                                                         null_values, False, after_data)
    # Keep the saved model in step with the refreshed buckets, as update_one_stats does.
    FJmodel.table_buckets = table_buckets
    for table in FJmodel.schema.tables:
        t_name = table.table_name
        if t_name in data and data[t_name] is not None:
            bn = FJmodel.bns[t_name]
            bn.null_values = null_values[t_name]
            bn.update_from_data(data[t_name])
            #test_trained_BN_on_stats(bn, t_name)
    print(f"updating completed, took {time.time() - start_time} sec")

    # Separate name for the output file so `model_path` keeps meaning "folder prefix".
    updated_model_file = model_path + f"updated_model_stats_{bucket_method}_{bin_size}.pkl"
    # Context manager closes the file even if pickling raises.
    with open(updated_model_file, 'wb') as f:
        pickle.dump(FJmodel, f, pickle.HIGHEST_PROTOCOL)
    print(f"updated models save at {updated_model_file}")

In [11]:
# Run the full train-then-update experiment with bin size 200 and the "sub_optimal"
# bucketing method, using eval_update's default split date.
data_path = "/home/ubuntu/End-to-End-CardEst-Benchmark/datasets/stats_simplified"
model_path = "/home/ubuntu/data_CE/CE_scheme_models/update/"
eval_update(data_path, model_path, 200, "sub_optimal")

************************************************************
Training the model with data before 2014-01-01 00:00:00
bucketizing equivalent key group: {'postHistory.PostId', 'postLinks.RelatedPostId', 'posts.Id', 'postLinks.PostId', 'tags.ExcerptPostId', 'comments.PostId', 'votes.PostId'}
bucketizing equivalent key group: {'votes.UserId', 'badges.UserId', 'posts.OwnerUserId', 'postHistory.UserId', 'users.Id', 'comments.UserId'}
badges
Discretizing table takes 0.015857696533203125 secs
Structure learning took 1.1505992412567139 secs.
done, parameter learning took 0.06596612930297852 secs.
votes
Discretizing table takes 2.3427138328552246 secs
Structure learning took 10.038585186004639 secs.
done, parameter learning took 0.18168926239013672 secs.
postHistory
Discretizing table takes 0.1303114891052246 secs
Structure learning took 13.11159086227417 secs.
done, parameter learning took 0.2548544406890869 secs.
posts
Discretizing table takes 0.6537225246429443 secs
Structure learning took 10



Discretizing table took 0.01146245002746582 secs.
done, incremental parameter updating took 0.09986567497253418 secs.




Discretizing table took 0.45528507232666016 secs.
done, incremental parameter updating took 0.0971364974975586 secs.
Discretizing table took 0.057456016540527344 secs.




done, incremental parameter updating took 0.2699098587036133 secs.




Discretizing table took 0.21903753280639648 secs.
done, incremental parameter updating took 0.24938750267028809 secs.




Discretizing table took 0.5899736881256104 secs.
done, incremental parameter updating took 0.1284177303314209 secs.
Discretizing table took 0.04863405227661133 secs.




done, incremental parameter updating took 0.2456510066986084 secs.
Discretizing table took 0.014635086059570312 secs.
done, incremental parameter updating took 0.09695196151733398 secs.
updating completed, took 4.804568529129028 sec
updated models save at /home/ubuntu/data_CE/CE_scheme_models/update/updated_model_stats_sub_optimal_200.pkl
