In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pareto
import sys
import pickle
sys.path.append('/Users/ziniuwu/Desktop/research/BayesCard')
from Models.Bayescard_BN import Bayescard_BN, build_meta_info
from time import perf_counter
from Evaluation.utils import parse_query
from Evaluation.cardinality_estimation import parse_query_single_table

In [2]:
def discretize_series(s, domain_size):
    """
    Turn a continuous sample into whole-number codes below ``domain_size``.

    Entries that are >= ``domain_size`` are dropped and replaced by the same
    number of uniform random integers drawn from [0, domain_size); the
    remaining entries are floored. The combined values come back in shuffled
    order, so output positions do not correspond to positions in ``s``.
    """
    overflow_count = len(s[s >= domain_size])
    floored = np.floor(s[s < domain_size])
    # RNG call order matters for seeded runs: replacements first, shuffle second.
    replacements = np.random.randint(domain_size, size=overflow_count)
    return np.random.permutation(np.concatenate((floored, replacements)))
    
def data_generation(skew, domain_size, correlation, column_size, nrows=1000000):
    """
    Generate a synthetic table with ``column_size`` columns and ``nrows`` rows.

    Column 0 is uniform random integers in [0, domain_size). Every later column
    starts from a Pareto(b=skew, scale=1) sample passed through
    ``discretize_series``; then a random ``int(nrows * correlation)`` subset of
    its rows is overwritten with the values of one earlier column (column 0 for
    column 1, otherwise a randomly chosen earlier column).

    Returns a DataFrame with float columns named ``attr0`` .. ``attr{column_size-1}``.
    Raises AssertionError ("invalid domain") if a column ends up with more than
    ``domain_size`` distinct values.
    """
    n_copied = int(nrows * correlation)
    columns = np.zeros((column_size, nrows))
    for col_no in range(column_size):
        if col_no == 0:
            columns[col_no, :] = np.random.randint(domain_size, size=nrows)
            continue

        raw_sample = pareto.rvs(b=skew, scale=1, size=nrows)
        column = discretize_series(raw_sample, domain_size)

        # Exactly one parent column is used; for col_no >= 2 it is picked at random.
        if col_no == 1:
            parents = [0]
        else:
            parents = np.random.permutation(col_no)[:1]

        copied_rows = np.random.permutation(nrows)[:n_copied]
        if len(copied_rows) > 0:
            parent_values = np.ceil(np.mean(columns[parents, :], axis=0))
            column[copied_rows] = parent_values[copied_rows]

        assert len(np.unique(column)) <= domain_size, "invalid domain"
        columns[col_no, :] = column

    column_names = [f"attr{k}" for k in range(column_size)]
    return pd.DataFrame(data=columns.transpose(), columns=column_names)

def query_generation(data, table_name, num_sample=200, p=0.8, nval_per_col=4, skip_zero_bit=6):
    """
    Collect ``num_sample`` random queries on ``data`` with their true cardinalities.

    ``generate_single_query`` is called repeatedly; attempts that return a
    ``None`` query are discarded and retried until a usable one is produced.

    Returns:
        (queries, cards): two parallel lists of length ``num_sample``.
    """
    queries, cards = [], []
    for _ in range(num_sample):
        while True:
            query, card = generate_single_query(data, table_name, p, nval_per_col, skip_zero_bit)
            if query is not None:
                break
        queries.append(query)
        cards.append(card)
    return queries, cards

def generate_single_query(df, table_name, p=0.8, nval_per_col=4, skip_zero_bit=6):
    """
    Generate one random conjunctive COUNT(*) query over ``df`` and its true cardinality.

    Each column of ``df`` takes part in the predicate with probability ``p``. For
    a participating column, ``nval_per_col`` row positions are drawn at random and
    the smallest and largest sampled values become the bounds: an equality
    predicate when they coincide, otherwise a closed range whose two bounds are
    both shifted up by ``skip_zero_bit``.

    p, nval_per_col, and skip_zero_bit control the size of the true cardinality.
    A smaller true cardinality generally leads to a larger q-error, which would
    bias the experimental results, so these knobs are used to keep the true
    cardinality similar across experiments.

    Returns:
        (query, card): the SQL text and the number of rows of ``df`` matching it,
        or (None, None) when no column was selected, the predicate could not be
        evaluated, or it matches no rows. Callers are expected to retry.
    """
    query = f"SELECT COUNT(*) FROM {table_name} WHERE "
    execute_query = ""
    for col in df.columns:
        # The draw is 0 with probability p; only then does the column get a predicate.
        if np.random.choice([0, 1], p=[p, 1 - p]) != 0:
            continue
        index = np.random.choice(len(df), size=nval_per_col)
        val = sorted(list(df[col].iloc[index]))
        left_val = val[0]
        right_val = val[-1]
        if left_val == right_val:
            sub_query = col + '==' + str(left_val) + ' and '
            act_sub_query = col + ' = ' + str(left_val) + ' AND '
        else:
            if skip_zero_bit:
                left_val += skip_zero_bit
                right_val += skip_zero_bit
            sub_query = str(left_val) + ' <= ' + col + ' <= ' + str(right_val) + ' and '
            act_sub_query = col + ' >= ' + str(left_val) + ' AND ' + col + ' <= ' + str(right_val) + ' AND '
        # execute_query is the pandas expression, query the equivalent SQL text.
        execute_query += sub_query
        query += act_sub_query
    if execute_query == "":
        return None, None
    # Strip the trailing ' and ' / ' AND ' connector (5 characters each).
    execute_query = execute_query[:-5]
    query = query[:-5]
    try:
        card = len(df.query(execute_query))
    except Exception:
        # Best effort: an expression pandas cannot evaluate counts as "no rows".
        # Only Exception is caught so that KeyboardInterrupt/SystemExit still
        # propagate; the caller retries in a loop and must remain interruptible.
        card = 0
    if card == 0:
        return None, None
    return query, card

In [3]:
# Experiment configuration for the synthetic single-table benchmark.
# skew / domain_size / correlation / column_size / nrows are passed to data_generation;
# num_sample / p / nval_per_col / skip_zero_bit are passed to query_generation.
skew=1.0
domain_size=100
correlation=0.4
column_size=10
nrows=1000000
num_sample=200
p=0.8
nval_per_col=4
skip_zero_bit=6
# Passed to build_from_data as sample_size, n_mcv and n_bins respectively
# (presumably: training sample size, most-common-value count, histogram bin count — TODO confirm).
rows_to_use=10000
n_mcv=30
n_bins=70
# NOTE(review): no RNG seed is set in the visible cells, so data and queries differ between runs.
data = data_generation(skew, domain_size, correlation, column_size, nrows=nrows)
# Extra columns appended after generation: a unique row counter and two
# uniform random integer columns in [0, 50).
data['attr10'] = np.arange(len(data))
data['fanout_mul_1'] = np.random.randint(50, size=len(data))
data['fanout_mul_1_nn'] = np.random.randint(50, size=len(data))
name = f"toy_{skew}_{domain_size}_{correlation}_{column_size}"
# Queries and true cardinalities are computed on the full table, including the extra columns.
queries, cards = query_generation(data, name, num_sample, p, nval_per_col, skip_zero_bit)
meta_info = build_meta_info(list(data.columns), None)
# Build the model on the full table.
BN = Bayescard_BN(name, full_join_size=len(data), meta_info = meta_info)
BN.build_from_data(data, sample_size=rows_to_use, n_mcv=n_mcv, n_bins=n_bins)

In [4]:
def test_CardEst(BN, queries, cards):
    BN.infer_algo = "exact-jit"
    BN.init_inference_method()
    latencies = []
    q_errors = []
    for query_no, query_str in enumerate(queries):
        query = parse_query_single_table(query_str.strip(), BN)
        cardinality_true = cards[query_no]
        card_start_t = perf_counter()
        cardinality_predict = BN.query(query)
        card_end_t = perf_counter()
        latency_ms = (card_end_t - card_start_t) * 1000
        if cardinality_predict == 0 and cardinality_true == 0:
            q_error = 1.0
        elif np.isnan(cardinality_predict) or cardinality_predict == 0:
            cardinality_predict = 1
            q_error = max(cardinality_predict / cardinality_true, cardinality_true / cardinality_predict)
        elif cardinality_true == 0:
            cardinality_true = 1
            q_error = max(cardinality_predict / cardinality_true, cardinality_true / cardinality_predict)
        else:
            q_error = max(cardinality_predict / cardinality_true, cardinality_true / cardinality_predict)
        latencies.append(latency_ms)
        q_errors.append(q_error)
    for i in [50, 90, 95, 99, 100]:
        print(f"q-error {i}% percentile is {np.percentile(q_errors, i)}")
    print(f"average latency is {np.mean(latencies)} ms")
    return q_errors, latencies

In [None]:
# Baseline evaluation; in top-to-bottom order BN is the model built on the full table.
# The returned lists are discarded, only the printed summary is used.
print("the original model performance:")
_, _ = test_CardEst(BN, queries, cards)

In [5]:
# Simulate a data update: train on the first 700,000 rows only, then fold the
# remaining rows into the model incrementally.
data_stale = data[0:700000]
data_update = data[700000:]
meta_info = build_meta_info(list(data_stale.columns), None)
# This rebinds BN, replacing the model that was built on the full table.
BN = Bayescard_BN(name, full_join_size=len(data_stale), meta_info = meta_info)
BN.build_from_data(data_stale, sample_size=rows_to_use, n_mcv=n_mcv, n_bins=n_bins)
# The return value is not captured here.
BN.update_from_data(data_update)

Discretizing table takes 4.318660259246826 secs
Structure learning took 1.5511717796325684 secs.
done, parameter learning took 1.6098380088806152 secs.
Discretizing table took 2.398975133895874 secs.




done, incremental parameter updating took 1.0968120098114014 secs.


In [7]:
# Evaluate the incrementally updated BN on the same queries; the true
# cardinalities in `cards` were computed on the full table.
_,_ = test_CardEst(BN, queries, cards)

q-error 50% percentile is 1.5329097427360434
q-error 90% percentile is 3.3354626337336586
q-error 95% percentile is 4.627138520947003
q-error 99% percentile is 12.425659946460712
q-error 100% percentile is 19.420508824896515
average latency is 7.003136390000293 ms


In [None]:
# Second update batch generated with different settings than the training data:
# skew 2.0 and domain size 300 (instead of the configured skew/domain_size), 100,000 rows.
data2 = data_generation(2.0, 300, correlation, column_size, nrows=100000)
# Row counter shifted to cover -50000 .. 49999.
data2['attr10'] = np.arange(len(data2))-50000
data2['fanout_mul_1'] = np.random.randint(50, size=len(data2))
data2['fanout_mul_1_nn'] = np.random.randint(50, size=len(data2))
# Keep the return value of the update for inspection in later cells.
new_BN = BN.update_from_data(data2)

In [None]:
# Consistency check: for every attribute, what the model holds in mapping/encoding
# must be contained in the corresponding *_update structure (mapping keys for
# "continuous" attributes, encoding values otherwise).
# The offending column name is used as the assertion message.
for col in BN.attr_type:
    if BN.attr_type[col] == "continuous":
        assert set(BN.mapping[col].keys()).issubset(set(BN.mapping_update[col].keys())), col
    else:
        assert set(BN.encoding[col].values()).issubset(set(BN.encoding_update[col].values())), col

In [None]:
# Scratch: inspect the mapping BN holds for 'attr10' (the row-counter column).
BN.mapping['attr10']

In [None]:
# Scratch: print the values of BN.mapping['attr10'] as a plain list.
print(list(BN.mapping['attr10'].values()))

In [None]:
# Scratch: print the sum of each of the first 76 columns of the value table of cpds[10]
# (presumably checking that every conditional distribution sums to 1 — TODO confirm).
# NOTE(review): 76 is hard-coded rather than derived from the table's shape.
for i in range(76):
    print(np.sum(BN.model.cpds[10].values[:,i]))

In [None]:
# Scratch: show the evidence attribute of the CPD at index 10
# (presumably its parent variables — TODO confirm).
print(BN.model.cpds[10].evidence)

In [None]:
# Scratch: show the state names of CPD 10 on new_BN, the value returned by
# BN.update_from_data(data2).
print(new_BN.cpds[10].state_names)

In [None]:
# Scratch: build bin edges from `a`: the left edge of a[0] followed by the right edge of every a[k].
# NOTE(review): no earlier visible cell defines an `a` whose entries have .left/.right,
# so this depends on leftover kernel state.
bins = [a[0].left] + [a[k].right for k in a]

In [None]:
# Scratch: bucket `b` into the intervals delimited by `bins`.
# NOTE(review): `b` is not assigned in any earlier visible cell.
temp = pd.cut(b, bins=bins)

In [None]:
# Scratch: pick the interval with the second-smallest left edge among those present in temp.
i = sorted(list(temp.unique()), key=lambda x: x.left)[1]

In [None]:
# Scratch: look up label 36.0 among the 10 largest entries of vc.
# NOTE(review): `vc` is never assigned in the visible notebook.
vc.nlargest(10)[36.0]

In [None]:
# Scratch: index labels of the entries of temp equal to interval i
# (assumes temp is a Series, i.e. that `b` was one — TODO confirm).
np.asarray(temp[temp==i].index)

In [None]:
# Scratch: concatenating an array with a plain list -> array([0, 1, 2, 4]).
np.concatenate((np.arange(3), [4]))

In [None]:
# Scratch: set union -> {1, 2, 3, 4, 5}.
set([1,2,3,4])|set([1,4,5])

In [None]:
# Scratch: distinct values stored in BN.encoding for 'attr9'.
np.unique(np.asarray(list(BN.encoding['attr9'].values())))

In [None]:
# Scratch: look up label 61 in vc. NOTE(review): `vc` is never assigned in the visible notebook.
vc[61]

In [None]:
# Scratch: inspect BN.n_in_bin for 'attr2'.
BN.n_in_bin['attr2']

In [None]:
# Scratch: a difference of 1e-7 is within np.isclose's default tolerances -> True.
np.isclose(1.0000001, 1)

In [None]:
# Scratch: set equality ignores element order -> True.
set([1,2]) == set([2,1])

In [None]:
# Scratch: building a set drops duplicates -> {1, 2, 3, 4}.
set([2,2,2,3,1,4])

In [None]:
# Scratch: 4x4 array of ones for the indexing experiments.
a = np.ones(tuple([4,4]))

In [None]:
# Scratch: one list of positions per axis for a fancy-indexing experiment.
indx = [[1,2,3], [1,2,3], [1,3]]
#a[np.array(indx[0]), indx[1], indx[2]]
# NOTE(review): three index terms are used here, which needs a 3-D `a`; the np.ones
# cell in this notebook creates a 2-D one — confirm which array this was run against.
a[np.array(indx[0]), np.array(indx[1]), :].shape

In [None]:
import pandas as pd

In [None]:
# Scratch: wrap the first slice along axis 0 of `a` in a DataFrame.
# NOTE(review): `a` is indexed with three terms, i.e. treated as 3-D — confirm.
data_df = pd.DataFrame(a[0,:,:])

In [None]:
# Scratch: chained fancy indexing. The first step, a[[1,2,3]], uses an integer list and
# therefore yields a copy, so the final assignment writes to a temporary and leaves `a` unchanged.
a[[1,2,3]][:,[1,2,3]][:,:,[1,3]] = 0

In [None]:
# Scratch: chained basic slices are views and each one applies to axis 0, so this
# zeroes the first two entries along axis 0 of `a` (not a 3x3x2 sub-block).
a[:3][:3][:2] = 0

In [None]:
# Scratch: a single 2x2 integer array indexes axis 0 only; the result has shape
# (2, 2) followed by the remaining dimensions of `a`.
indx = np.asarray([[1,2],[1,2]])
a[indx].shape

In [None]:
# Scratch: pairs a column vector of axis-0 positions with indx for a per-row selection.
# NOTE(review): the two index arrays must broadcast, i.e. a.shape[0] has to be
# compatible with indx's first dimension (2) — confirm this ran as intended.
a[np.arange(a.shape[0])[:,None],indx]

In [None]:
# Scratch: rebind `a` to a 4x4 grid holding 0..15 so selected positions are identifiable by value.
a = np.arange(16).reshape((4,4))

In [None]:
# Scratch: display `a`.
a

In [None]:
# Scratch: two position arrays, used by later cells as per-axis indices (axis 0: 1,2; axis 1: 1,2,3).
b = [np.asarray([1,2]),np.asarray([1,2,3])]

In [None]:
# Scratch: reshaping the first index array to a column makes the two arrays broadcast
# against each other, selecting every (row, column) combination — a 2x3 result.
a[tuple([b[0].reshape(-1, 1), b[1]])]

In [None]:
# Scratch: display the index tuple itself (a column array and a flat array).
tuple([b[0].reshape(-1, 1), b[1]])

In [None]:
# Scratch: reshaping the 2-element array to (-1, 1, 1) gives shape (2, 1, 1).
b[0].reshape((-1, 1, 1)).shape

In [None]:
def multi_dim_index(a, index, value=1):
    """
    Assign ``value`` to the cross-product of per-axis position lists, in place.

    ``index`` holds one sequence of integer positions per axis of ``a``. The
    selected region is every combination of those positions (an open mesh, as
    built by ``np.ix_``), so its shape is ``tuple(len(ind) for ind in index)``.

    Args:
        a: numpy array; modified in place.
        index: sequence of array-likes, one per axis of ``a``. Each entry is
            flattened before use.
        value: scalar or array broadcastable to the selected region. The default
            of 1 gives the same result as the previously hard-coded
            ``np.ones((2, 3))`` for a 2x3 selection, and also works for any
            other selection shape.

    Returns:
        ``a`` itself (the same object), after the assignment.

    Raises:
        AssertionError: if ``len(index) != a.ndim``.
    """
    assert a.ndim == len(index)
    # np.ix_ only accepts 1-D sequences, so flatten each entry first.
    mesh = np.ix_(*(np.asarray(ind).reshape(-1) for ind in index))
    a[mesh] = value
    return a

In [None]:
# Scratch: apply the helper to `a` with the per-axis positions in `b`; in top-to-bottom
# order that sets the 2x3 sub-block (rows 1-2, columns 1-3) of the 4x4 grid to 1 in place
# and displays the result.
multi_dim_index(a, b)