In [1]:
import numpy as np
import pandas as pd
import pickle
from scipy import stats
import time
import sys
sys.path.append("/home/ubuntu/CE_scheme")
from Schemas.stats.schema import gen_stats_light_schema
from Evaluation.training import train_one_stats, test_trained_BN_on_stats
from Join_scheme.data_prepare import process_stats_data, update_stats_data
from BayesCard.Models.Bayescard_BN import Bayescard_BN
from BayesCard.Evaluation.cardinality_estimation import parse_query_single_table

In [2]:
# Folder holding the per-table CSV files; get_data_by_date appends "/{}.csv" to it
# before building the schema.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
data_folder = '/home/ubuntu/End-to-End-CardEst-Benchmark/datasets/stats_simplified'

In [3]:
def read_table_csv(table_obj, csv_seperator=',', stats=True):
    """
    Load one table's CSV into a DataFrame with table-qualified column names.

    Columns are renamed to ``<table_name>.<attribute>`` following the order of
    ``table_obj.attributes``; the columns named in
    ``table_obj.irrelevant_attributes`` are dropped; every remaining column is
    converted to a numeric dtype when all of its values parse as numbers and
    is left untouched otherwise.

    Parameters
    ----------
    table_obj : object
        Must expose ``csv_file_location``, ``table_name``, ``attributes`` and
        ``irrelevant_attributes``.
    csv_seperator : str
        Field delimiter. Only used when ``stats`` is False.
    stats : bool
        True  -> read with pandas defaults (the first row is the header).
        False -> read a header-less file with backslash escaping and the
        given delimiter.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If an entry of ``irrelevant_attributes`` is not among the columns.
    """
    if stats:
        df_rows = pd.read_csv(table_obj.csv_file_location)
    else:
        df_rows = pd.read_csv(table_obj.csv_file_location, header=None, escapechar='\\', encoding='utf-8',
                              quotechar='"',
                              sep=csv_seperator)

    prefix = table_obj.table_name + '.'
    df_rows.columns = [prefix + attr for attr in table_obj.attributes]

    # Drop all irrelevant columns in one call instead of copying the frame per column.
    irrelevant_columns = [prefix + attribute for attribute in table_obj.irrelevant_attributes]
    if irrelevant_columns:
        df_rows = df_rows.drop(columns=irrelevant_columns)

    def _numeric_or_unchanged(column):
        # Explicit equivalent of pd.to_numeric(..., errors="ignore"), which
        # pandas deprecated in 2.2: keep the column as-is when it does not parse.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    return df_rows.apply(_numeric_or_unchanged)


def timestamp_transorform(time_string, start_date="2010-07-19 00:00:00"):
    """
    Return the whole seconds elapsed between ``start_date`` and ``time_string``.

    Both arguments use the ``YYYY-mm-dd HH:MM:SS`` layout. They are converted
    with ``time.mktime``, so they are interpreted in the machine's local time
    zone.
    """
    layout = "%Y-%m-%d %H:%M:%S"
    origin_seconds, target_seconds = (
        int(time.mktime(time.strptime(stamp, layout)))
        for stamp in (start_date, time_string)
    )
    return target_seconds - origin_seconds


def get_data_by_date(data_path, time_date="2014-01-01 00:00:00"):
    """
    Split every table of the schema into the rows before and from ``time_date``.

    ``data_path`` is either a ``.csv`` path/template or a folder, in which case
    ``"/{}.csv"`` is appended before it is handed to ``gen_stats_light_schema``.
    For each table the first column whose name contains ``"Date"`` decides the
    split position; tables without such a column end up entirely in the
    "before" part.

    Returns
    -------
    (dict, dict)
        ``before_data`` and ``after_data``, both keyed by table name. A value is
        ``None`` when that side of the split holds no rows.
    """
    cutoff = timestamp_transorform(time_date)
    if not data_path.endswith(".csv"):
        data_path += "/{}.csv"
    schema = gen_stats_light_schema(data_path)

    before_data, after_data = {}, {}
    for table_obj in schema.tables:
        table_name = table_obj.table_name
        df_rows = read_table_csv(table_obj)
        n_rows = len(df_rows)

        date_column = next((col for col in df_rows.columns if "Date" in col), None)
        if date_column is None:
            split_at = n_rows
        else:
            # np.searchsorted expects ascending input.
            # NOTE(review): assumes the CSV rows are already ordered by this column — confirm.
            split_at = np.searchsorted(df_rows[date_column].values, cutoff)

        before_data[table_name] = df_rows[:split_at] if split_at > 0 else None
        after_data[table_name] = df_rows[split_at:] if split_at < n_rows else None
    return before_data, after_data


In [4]:
# Split each table at get_data_by_date's default cut-off ("2014-01-01 00:00:00"):
# `before_data` is used for training below, `after_data` for the incremental update.
before_data, after_data = get_data_by_date(data_folder)

In [5]:
# Preprocess the pre-cut-off tables; the recorded output shows this step
# bucketizing the equivalent join-key groups.
model_path = "/home/ubuntu/data_CE/CE_scheme_models/update/"
(
    data,
    null_values,
    key_attrs,
    table_buckets,
    equivalent_keys,
    schema,
    bin_size,
) = process_stats_data(
    data_folder,
    model_path,
    200,
    "sub_optimal",
    False,
    data=before_data,
)


bucketizing equivalent key group: {'votes.PostId', 'tags.ExcerptPostId', 'postHistory.PostId', 'postLinks.RelatedPostId', 'posts.Id', 'comments.PostId', 'postLinks.PostId'}
bucketizing equivalent key group: {'badges.UserId', 'posts.OwnerUserId', 'votes.UserId', 'postHistory.UserId', 'comments.UserId', 'users.Id'}


In [6]:
# Fit a stand-alone Bayescard_BN for the postHistory table on the frame returned by
# process_stats_data (which was given the pre-cut-off rows).
t_name = "postHistory"
bn = Bayescard_BN(t_name, key_attrs[t_name], bin_size[t_name], null_values=null_values[t_name])
bn.build_from_data(data[t_name])

Discretizing table takes 0.10353326797485352 secs
Structure learning took 12.671696662902832 secs.
done, parameter learning took 0.24830865859985352 secs.


In [None]:
# Train the full model on the pre-cut-off rows.
# NOTE(review): the next cell loads "buckets.pkl" and "model_stats_sub_optimal_200.pkl"
# from model_path — presumably written by this call; confirm. This cell has no execution
# count, so those files must already exist for a fresh Run All to work.
model_path = "/home/ubuntu/data_CE/CE_scheme_models/update/"
train_one_stats("stats", data_folder, model_path, 200, "sub_optimal", True, actual_data=before_data)

In [7]:
# Load the bucket definitions and the trained model from model_path.
# SECURITY: pickle.load can execute arbitrary code — only load files produced by
# this pipeline, never files from an untrusted source.
buckets_path = model_path + "buckets.pkl"
fj_model_path = model_path + "model_stats_sub_optimal_200.pkl"

with open(buckets_path, "rb") as f:
    buckets = pickle.load(f)

with open(fj_model_path, "rb") as f:
    FJmodel = pickle.load(f)

In [None]:
# Read the workload file; every line is kept (newline included) and is later
# split on "||" by the evaluation loop.
query_file = "/home/ubuntu/End-to-End-CardEst-Benchmark/workloads/stats_CEB/sub_plan_queries/stats_CEB_sub_queries.sql"
with open(query_file, "r") as f:
    queries = list(f)


In [None]:
# Run every workload query through the model and record, per query, the estimate
# (`pred`), the inference latency (`latency`) and the ratio estimate / true
# cardinality (`qerror`).
# NOTE: `qerror` holds the plain ratio res / true_card, not the symmetric
# q-error max(r, 1/r); values below 1 mean the estimate is under the true count.
qerror = []
latency = []
pred = []
for i, query_str in enumerate(queries):
    if not query_str.strip():
        # Tolerate blank lines (e.g. a trailing newline at the end of the file)
        # instead of failing on int("").
        continue
    fields = query_str.split("||")
    # The last character before the first "||" is dropped before the text is
    # passed to the model.
    query = fields[0][:-1]
    print("========================")
    true_card = int(fields[-1])

    # Time only the model call, with a monotonic clock, and reuse that single
    # measurement for both the recorded and the printed latency.
    t = time.perf_counter()
    res = FJmodel.get_cardinality_bound(query)
    elapsed = time.perf_counter() - t

    # A true cardinality of 0 would otherwise abort the whole evaluation.
    ratio = res / true_card if true_card else float("inf")

    pred.append(res)
    latency.append(elapsed)
    qerror.append(ratio)
    print(f"estimating query {i}: predicted {res}, true_card {true_card}, qerror {ratio}, latency {elapsed}")

In [None]:
# Summarise the evaluation: selected percentiles of the recorded ratios and the
# summed inference latency.
percentile_levels = [50, 90, 95, 99, 100]
percentile_values = np.percentile(qerror, percentile_levels)
for level, value in zip(percentile_levels, percentile_values):
    print(f"q-error {level}% percentile is {value}")
print(f"total inference time: {np.sum(latency)}")

In [8]:
# Prepare the post-cut-off rows for the incremental update, starting from the
# buckets and null values stored on the loaded model.
# Note: `data`, `table_buckets` and `null_values` are rebound here and no longer
# refer to the values produced by process_stats_data above.
table_buckets = FJmodel.table_buckets
null_values = FJmodel.null_value
data, table_buckets, null_values = update_stats_data(
    data_folder,
    model_path,
    buckets,
    table_buckets,
    null_values,
    False,
    after_data,
)

tags does not have data to update
updating equivalent key group: {'votes.PostId', 'tags.ExcerptPostId', 'postHistory.PostId', 'postLinks.RelatedPostId', 'posts.Id', 'comments.PostId', 'postLinks.PostId'}
tags.ExcerptPostId
updating equivalent key group: {'badges.UserId', 'posts.OwnerUserId', 'votes.UserId', 'postHistory.UserId', 'comments.UserId', 'users.Id'}


In [9]:
def test_trained_BN_on_stats(bn, t_name):
    queries = {
        "posts": "SELECT COUNT(*) FROM posts as p WHERE posts.CommentCount<=18 AND posts.CreationDate>='2010-07-23 07:27:31'::timestamp AND posts.CreationDate<='2014-09-09 01:43:00'::timestamp",
        "comments": "SELECT COUNT(*) FROM comments as c WHERE comments.CreationDate>='2010-08-05 00:36:02'::timestamp AND comments.CreationDate<='2014-09-08 16:50:49'::timestamp",
        "postHistory": "SELECT COUNT(*) FROM postHistory as ph WHERE postHistory.PostHistoryTypeId=1 AND postHistory.CreationDate>='2010-09-14 11:59:07'::timestamp",
        "votes": "SELECT COUNT(*) FROM votes as v WHERE votes.VoteTypeId=2 AND votes.CreationDate<='2014-09-10 00:00:00'::timestamp",
        "postLinks": "SELECT COUNT(*) FROM postLinks as pl WHERE postLinks.LinkTypeId=1 AND postLinks.CreationDate>='2011-09-03 21:00:10'::timestamp AND postLinks.CreationDate<='2014-07-30 21:29:52'::timestamp",
        "users": "SELECT COUNT(*) FROM users as u WHERE users.DownVotes>=0 AND users.DownVotes<=0 AND users.UpVotes>=0 AND users.UpVotes<=31 AND users.CreationDate<='2014-08-06 20:38:52'::timestamp",
        "badges": "SELECT COUNT(*) FROM badges as b WHERE badges.Date>='2010-09-26 12:17:14'::timestamp",
        "tags": "SELECT COUNT(*) FROM tags"
    }

    true_cards = {
        "posts": 90764,
        "comments": 172156,
        "postHistory": 42308,
        "votes": 261476,
        "postLinks": 8776,
        "users": 37062,
        "badges": 77704,
        "tags": 1032
    }

    bn.init_inference_method()
    bn.infer_algo = "exact-jit"
    query = parse_query_single_table(queries[t_name], bn)
    pred = bn.query(query)
    print(pred)
    assert min(pred, true_cards[t_name]) / max(pred, true_cards[t_name]) <= 1.5, f"Qerror too large, we have predition" \
                                                                        f"{pred} for true card {true_cards[t_name]}"

    query = parse_query_single_table(queries[t_name], bn)
    _, id_probs = bn.query_id_prob(query, bn.id_attributes)
    print(np.sum(id_probs))
    if t_name not in ['votes', 'tags']:
        assert min(pred, np.sum(id_probs)) / max(pred, np.sum(id_probs)) <= 1.5, "query_id_prob is incorrect"

In [10]:
# Incrementally update the postHistory BN stored in the loaded model with the frame
# returned by update_stats_data, then sanity-check it.
# NOTE(review): the recorded output of this cell ends with `KeyError: 120`, printed after
# the "incremental parameter updating" message. The traceback is not preserved, so the
# failing call cannot be determined from the notebook alone.
t_name = "postHistory"
bn = FJmodel.bns[t_name]
bn.null_values = null_values[t_name]
print(len(data[t_name]))
bn.update_from_data(data[t_name])
test_trained_BN_on_stats(bn, t_name)

88497
None
postHistory.CreationDate continuous
{0: Interval(-45913128.001, 806892.3, closed='right'), 1: Interval(806892.3, 2224659.45, closed='right'), 2: Interval(2224659.45, 4448732.5, closed='right'), 3: Interval(4448732.5, 6398143.0, closed='right'), 4: Interval(6398143.0, 8332076.5, closed='right'), 5: Interval(8332076.5, 10505734.6, closed='right'), 6: Interval(10505734.6, 12502397.475, closed='right'), 7: Interval(12502397.475, 15210418.0, closed='right'), 8: Interval(15210418.0, 17234072.425, closed='right'), 9: Interval(17234072.425, 18875118.5, closed='right'), 10: Interval(18875118.5, 20220825.85, closed='right'), 11: Interval(20220825.85, 21495720.0, closed='right'), 12: Interval(21495720.0, 22932501.675, closed='right'), 13: Interval(22932501.675, 24359971.65, closed='right'), 14: Interval(24359971.65, 25554386.0, closed='right'), 15: Interval(25554386.0, 26935924.8, closed='right'), 16: Interval(26935924.8, 28476299.825, closed='right'), 17: Interval(28476299.825, 298872



done, incremental parameter updating took 0.37574028968811035 secs.


KeyError: 120

In [None]:
# Leftover inspection cell: at notebook scope `pred` is the list of estimates filled by
# the evaluation loop (the `pred` inside test_trained_BN_on_stats is local to that function).
pred

In [None]:
# Update the BN of every table that has update data, then sanity-check each one.
for table in FJmodel.schema.tables:
    t_name = table.table_name
    print(t_name)
    if t_name not in data or data[t_name] is None:
        # No update data for this table: leave its BN untouched.
        continue
    bn = FJmodel.bns[t_name]
    bn.null_values = null_values[t_name]
    bn.update_from_data(data[t_name])
    test_trained_BN_on_stats(bn, t_name)

In [None]:
# Scratch cell: list.remove deletes the first element equal to its argument, in place,
# so `a` becomes [2, 2, 3].
a = [1,2,2,3,]
a.remove(1)
a

In [None]:
# Scratch cell: a one-element array of ones cast to an integer dtype.
np.ones(1).astype(int)

In [None]:
# Scratch cell: drop the first element of an array.
# ndarray has no .delete method (the original call raised AttributeError);
# np.delete returns a new array without the given index.
a = np.arange(10)
a = np.delete(a, 0)
a