In [1]:
import numpy as np
import pandas as pd
import pickle
from scipy import stats
import time
import sys
sys.path.append("/home/ubuntu/CE_scheme")
from Schemas.stats.schema import gen_stats_light_schema
from Evaluation.training import train_one_stats
from Join_scheme.data_prepare import process_stats_data, update_stats_data
from BayesCard.Models.Bayescard_BN import Bayescard_BN

In [2]:
# Location of the simplified STATS dataset. get_data_by_date() appends
# "/{}.csv" to this path before handing it to gen_stats_light_schema, so it is
# presumably a directory holding one CSV per table (TODO confirm).
data_folder = '/home/ubuntu/End-to-End-CardEst-Benchmark/datasets/stats_simplified'

In [3]:
def read_table_csv(table_obj, csv_seperator=',', stats=True):
    """
    Load one table's CSV into a DataFrame with fully-qualified column names.

    Parameters
    ----------
    table_obj : object
        Must expose ``csv_file_location``, ``table_name``, ``attributes``
        (one name per CSV column, in file order) and ``irrelevant_attributes``
        (names of columns to discard).
    csv_seperator : str
        Field separator; only used when ``stats`` is False.
    stats : bool
        True: read the file with pandas defaults (first row is a header).
        False: read a header-less file with backslash escapes, double-quote
        quoting, UTF-8 encoding and ``csv_seperator`` as the delimiter.

    Returns
    -------
    pandas.DataFrame
        Columns are renamed to ``"<table_name>.<attribute>"``, the irrelevant
        attributes are dropped, and every remaining column that can be parsed
        as numeric is converted; columns that cannot are returned unchanged.
    """
    if stats:
        df_rows = pd.read_csv(table_obj.csv_file_location)
    else:
        df_rows = pd.read_csv(table_obj.csv_file_location, header=None, escapechar='\\', encoding='utf-8',
                              quotechar='"',
                              sep=csv_seperator)

    prefix = table_obj.table_name + '.'
    df_rows.columns = [prefix + attr for attr in table_obj.attributes]

    # Drop all irrelevant columns in one call instead of copying the frame
    # once per attribute.
    df_rows = df_rows.drop(columns=[prefix + attr for attr in table_obj.irrelevant_attributes])

    def _to_numeric_if_possible(column):
        # Replacement for pd.to_numeric(..., errors="ignore"), which pandas has
        # deprecated: convert when possible, otherwise keep the column as-is.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    return df_rows.apply(_to_numeric_if_possible)


def timestamp_transorform(time_string, start_date="2010-07-19 00:00:00"):
    """
    Return the number of whole seconds from ``start_date`` to ``time_string``.

    Both arguments must follow the "%Y-%m-%d %H:%M:%S" layout. Each one is
    turned into epoch seconds with ``time.mktime`` (so it is interpreted as
    local time), truncated to an int, and the two values are subtracted.
    """
    layout = "%Y-%m-%d %H:%M:%S"
    # Parse both strings first, then convert, mirroring the original ordering.
    origin_struct, target_struct = (
        time.strptime(stamp, layout) for stamp in (start_date, time_string)
    )
    target_seconds = int(time.mktime(target_struct))
    origin_seconds = int(time.mktime(origin_struct))
    return target_seconds - origin_seconds


def get_data_by_date(data_path, time_date="2014-01-01 00:00:00"):
    """
    Split every table of the schema into rows before / from a cutoff date.

    ``time_date`` is converted with ``timestamp_transorform``. For each table
    the first column whose name contains "Date" is located and
    ``np.searchsorted`` gives the row index of the cutoff in that column; rows
    ahead of the index go to the "before" dict and the rest to the "after"
    dict. A table with no such column is placed entirely in "before".

    NOTE(review): ``np.searchsorted`` presumes the date column is sorted in
    ascending order -- confirm this holds for the dataset files.

    Returns
    -------
    (dict, dict)
        ``before_data`` and ``after_data``, keyed by table name. An entry is
        ``None`` when the corresponding slice would be empty.
    """
    cutoff = timestamp_transorform(time_date)
    csv_template = data_path if data_path.endswith(".csv") else data_path + "/{}.csv"
    schema = gen_stats_light_schema(csv_template)

    before_data, after_data = {}, {}
    for table_obj in schema.tables:
        name = table_obj.table_name
        rows = read_table_csv(table_obj)
        n_rows = len(rows)

        # Only the first matching column decides the split point.
        date_column = next((col for col in rows.columns if "Date" in col), None)
        if date_column is None:
            split_at = n_rows
        else:
            split_at = np.searchsorted(rows[date_column].values, cutoff)

        before_data[name] = rows[:split_at] if split_at > 0 else None
        after_data[name] = rows[split_at:] if split_at < n_rows else None
    return before_data, after_data


In [4]:
# Split every table at the default cutoff of get_data_by_date
# ("2014-01-01 00:00:00"): before_data is used below for the initial model,
# after_data for the incremental update.
before_data, after_data = get_data_by_date(data_folder)

In [5]:
# Pre-process the pre-cutoff tables only (data=before_data). The captured
# output shows this step bucketizing each equivalent join-key group.
# NOTE(review): the positional arguments 200, "sub_optimal" and False are
# passed straight to process_stats_data; their meaning is defined there --
# presumably bin count, bucketing method and a save flag (TODO confirm).
model_path = "/home/ubuntu/data_CE/CE_scheme_models/update/"
data, null_values, key_attrs, table_buckets, equivalent_keys, schema, bin_size = process_stats_data(data_folder,
                                        model_path, 200, "sub_optimal", False, data=before_data)


bucketizing equivalent key group: {'postLinks.PostId', 'posts.Id', 'tags.ExcerptPostId', 'comments.PostId', 'postHistory.PostId', 'postLinks.RelatedPostId', 'votes.PostId'}
bucketizing equivalent key group: {'comments.UserId', 'postHistory.UserId', 'badges.UserId', 'users.Id', 'posts.OwnerUserId', 'votes.UserId'}


In [6]:
# Fit a single-table Bayescard_BN on the pre-cutoff "posts" data, using the
# key attributes, bin sizes and null values produced by process_stats_data.
# The captured output reports discretization, structure learning and
# parameter learning timings.
t_name = "posts"
bn = Bayescard_BN(t_name, key_attrs[t_name], bin_size[t_name], null_values=null_values[t_name])
bn.build_from_data(data[t_name])

Discretizing table takes 0.5268080234527588 secs
Structure learning took 12.581604480743408 secs.
done, parameter learning took 0.21622991561889648 secs.


In [None]:
# Train the full "stats" model on the pre-cutoff data (actual_data=before_data).
# This cell has no execution count, i.e. it was not run in the saved session;
# the next cell loads pickles from model_path, which presumably come from an
# earlier run of this call (TODO confirm).
model_path = "/home/ubuntu/data_CE/CE_scheme_models/update/"
train_one_stats("stats", data_folder, model_path, 200, "sub_optimal", True, actual_data=before_data)

In [7]:
# Load the bucket definitions and the trained model from model_path.
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load files that were produced locally by a trusted training run.
with open(model_path + "buckets.pkl", "rb") as f:
    buckets = pickle.load(f)
with open(model_path + "model_stats_sub_optimal_200.pkl", "rb") as f:
    FJmodel = pickle.load(f)

In [None]:
# Read the sub-plan query workload, one entry per line (newline characters
# are kept by readlines()). The evaluation loop below splits each line on "||".
query_file = "/home/ubuntu/End-to-End-CardEst-Benchmark/workloads/stats_CEB/sub_plan_queries/stats_CEB_sub_queries.sql"
with open(query_file, "r") as f:
    queries = f.readlines()


In [None]:
# Evaluate the model on every workload line.
#   pred    - estimate returned by FJmodel.get_cardinality_bound
#   latency - seconds spent inside that call only
#   qerror  - raw ratio estimate / true cardinality (NOT the symmetric
#             max(est/true, true/est); values below 1 are under-estimates)
qerror = []
latency = []
pred = []
for query_str in queries:
    # A blank line (e.g. at the end of the file) has no cardinality to parse.
    if not query_str.strip():
        continue
    fields = query_str.split("||")
    # Drop the last character of the SQL text -- presumably the trailing ';'
    # (confirm against the workload file).
    query = fields[0][:-1]
    true_card = int(fields[-1])
    # perf_counter is monotonic and high-resolution; stop the clock right after
    # the estimator returns so list bookkeeping is not counted as latency.
    t = time.perf_counter()
    res = FJmodel.get_cardinality_bound(query)
    latency.append(time.perf_counter() - t)
    pred.append(res)
    qerror.append(res/true_card)

In [None]:
# Summarize the evaluation: selected percentiles of the estimate/true ratios
# collected in `qerror` (100% is the maximum) and the summed per-query latency.
for i in [50, 90, 95, 99, 100]:
    print(f"q-error {i}% percentile is {np.percentile(qerror, i)}")
print(f"total inference time: {np.sum(latency)}")

In [8]:
# Prepare the incremental update: start from the bucket and null-value state
# stored on the loaded model and feed the post-cutoff rows (after_data) to
# update_stats_data. Note that `data`, `table_buckets` and `null_values` are
# rebound here and no longer refer to the values from process_stats_data.
# The captured output shows that "tags" has no rows to update.
table_buckets = FJmodel.table_buckets
null_values = FJmodel.null_value
data, table_buckets, null_values = update_stats_data(data_folder, model_path, buckets, table_buckets,
                                                     null_values, False, after_data)

tags does not have data to update
updating equivalent key group: {'postLinks.PostId', 'posts.Id', 'tags.ExcerptPostId', 'comments.PostId', 'postHistory.PostId', 'postLinks.RelatedPostId', 'votes.PostId'}
tags.ExcerptPostId
updating equivalent key group: {'comments.UserId', 'postHistory.UserId', 'badges.UserId', 'users.Id', 'posts.OwnerUserId', 'votes.UserId'}


In [9]:
# Update the stand-alone "posts" network built earlier with the "posts" entry
# of the `data` returned by update_stats_data.
# NOTE(review): the saved output of this cell is "ValueError: -1 is not in list".
# Unlike the per-table loop further down, bn.null_values is not refreshed from
# the updated `null_values` before the call -- possibly related; confirm.
bn.update_from_data(data["posts"])

Discretizing table took 0.35541391372680664 secs.




ValueError: -1 is not in list

In [None]:
# Incrementally update every per-table network stored on the loaded model.
# Tables that are missing from `data`, or whose entry is None, are skipped.
# `t_name` and `bn` are rebound on every iteration, replacing the stand-alone
# "posts" network created earlier in the notebook.
for table in FJmodel.schema.tables:
    t_name = table.table_name
    print(t_name)
    if t_name in data and data[t_name] is not None:
        bn = FJmodel.bns[t_name]
        # Use the null values returned by update_stats_data for this table.
        bn.null_values = null_values[t_name]
        bn.update_from_data(data[t_name])

In [None]:
# Scratch check of np.union1d: the sorted unique union of [0, 1, 2] and
# [1, 2, 3, 4], i.e. [0, 1, 2, 3, 4].
np.union1d(np.arange(3), np.arange(4)+1)