In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pareto
import sys
import pickle
sys.path.append('/Users/ziniuwu/Desktop/research/BayesCard')
from Models.Bayescard_BN import Bayescard_BN
from time import perf_counter
from Evaluation.utils import parse_query
from Evaluation.cardinality_estimation import parse_query_single_table

In [8]:
def discretize_series(s, domain_size):
    """Turn a real-valued sample into integer-valued codes below ``domain_size``.

    Entries strictly smaller than ``domain_size`` are floored. Each entry that
    is ``>= domain_size`` is discarded and replaced by one draw from
    ``np.random.randint(domain_size)``. The floored and replacement values are
    concatenated and returned in a random order.

    Args:
        s: array of numeric samples (supports boolean-mask indexing).
        domain_size: exclusive upper bound of the output values.

    Returns:
        A shuffled NumPy array holding the floored in-range values followed
        (before shuffling) by the random replacements.
    """
    overflow_count = len(s[s >= domain_size])
    floored = np.floor(s[s < domain_size])
    # One uniform replacement per out-of-range entry.
    refill = np.random.randint(domain_size, size=overflow_count)
    combined = np.concatenate((floored, refill))
    return np.random.permutation(combined)
    
def data_generation(skew, domain_size, correlation, column_size, nrows=1000000):
    """Generate a synthetic table with tunable skew and inter-column correlation.

    Column 0 is drawn uniformly from ``[0, domain_size)``. Every later column
    starts as a Pareto sample (shape ``skew``, scale 1) passed through
    ``discretize_series``; then ``int(nrows * correlation)`` randomly chosen
    rows are overwritten with the value of one previously generated column
    (always column 0 for column 1, a random earlier column otherwise).

    Args:
        skew: shape parameter ``b`` handed to ``scipy.stats.pareto.rvs``.
        domain_size: exclusive upper bound of the generated values.
        correlation: fraction of rows copied from an earlier column.
        column_size: number of columns to generate.
        nrows: number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` of shape ``(nrows, column_size)`` whose columns
        are named ``attr0``, ``attr1``, ...
    """
    table = np.zeros((column_size, nrows))
    for col_idx in range(column_size):
        if col_idx == 0:
            # The first column has nothing to depend on: plain uniform draw.
            table[col_idx, :] = np.random.randint(domain_size, size=nrows)
            continue

        column = discretize_series(
            pareto.rvs(b=skew, scale=1, size=nrows), domain_size
        )

        # Pick the earlier column(s) this one is correlated with; exactly one
        # parent is used.
        if col_idx == 1:
            parents = [0]
        else:
            n_parents = 1
            parents = np.random.permutation(col_idx)[0:n_parents]

        # Rows whose value is copied from the parent column(s).
        copied_rows = np.random.permutation(nrows)[0:int(nrows * correlation)]
        if len(copied_rows) != 0:
            parent_values = np.ceil(np.mean(table[parents, :], axis=0))
            column[copied_rows] = parent_values[copied_rows]

        assert len(np.unique(column)) <= domain_size, "invalid domain"
        table[col_idx, :] = column

    column_names = [f"attr{i}" for i in range(column_size)]
    return pd.DataFrame(data=table.transpose(), columns=column_names)

def query_generation(data, table_name, num_sample=200, p=0.8, nval_per_col=4, skip_zero_bit=6):
    """Produce ``num_sample`` random queries together with their true cardinalities.

    ``generate_single_query`` is called repeatedly for each slot until it
    yields a query (it returns ``None`` when the attempt is unusable).

    Args:
        data: DataFrame the queries are generated from and evaluated on.
        table_name: table name embedded in the SQL text.
        num_sample: number of queries to return.
        p, nval_per_col, skip_zero_bit: forwarded unchanged to
            ``generate_single_query``.

    Returns:
        ``(queries, cards)``: two parallel lists of SQL strings and row counts.
    """
    queries, cards = [], []
    for _ in range(num_sample):
        # Retry until an attempt yields a usable query.
        while True:
            query, card = generate_single_query(data, table_name, p, nval_per_col, skip_zero_bit)
            if query is not None:
                break
        queries.append(query)
        cards.append(card)
    return queries, cards

def generate_single_query(df, table_name, p=0.8, nval_per_col=4, skip_zero_bit=6):
    """
    Build one random conjunctive ``COUNT(*)`` query over ``df`` and evaluate it.

    Every column independently receives a predicate with probability ``p``.
    For a chosen column, ``nval_per_col`` rows are sampled (with replacement)
    and the smallest / largest sampled values become the predicate bounds:
    an equality predicate when they coincide, otherwise a closed range whose
    two bounds are both shifted by ``skip_zero_bit`` (when it is non-zero).

    p, nval_per_col and skip_zero_bit control the true cardinality size.
    Smaller true cardinalities generally lead to larger q-errors, which would
    bias the experimental results, so these knobs are used to keep the true
    cardinality similar across experiments.

    Args:
        df: DataFrame to generate the query from and to evaluate it on.
        table_name: table name embedded in the SQL text.
        p: probability that a column gets a predicate.
        nval_per_col: number of rows sampled per chosen column.
        skip_zero_bit: offset added to both bounds of range predicates.

    Returns:
        ``(query, card)`` where ``query`` is the SQL string and ``card`` the
        number of matching rows, or ``(None, None)`` when no column was
        chosen, the pandas expression could not be evaluated, or no row
        matches.
    """
    sql_prefix = f"SELECT COUNT(*) FROM {table_name} WHERE "
    pandas_predicates = []  # evaluated with DataFrame.query
    sql_predicates = []     # the same predicates, in SQL syntax

    for col in df.columns:
        if np.random.choice([0, 1], p=[p, 1 - p]) != 0:
            continue
        sampled_rows = np.random.choice(len(df), size=nval_per_col)
        sampled_vals = sorted(df[col].iloc[sampled_rows])
        left_val, right_val = sampled_vals[0], sampled_vals[-1]
        if left_val == right_val:
            pandas_predicates.append(f"{col}=={left_val!s}")
            sql_predicates.append(f"{col} = {left_val!s}")
        else:
            if skip_zero_bit:
                left_val += skip_zero_bit
                right_val += skip_zero_bit
            pandas_predicates.append(f"{left_val!s} <= {col} <= {right_val!s}")
            sql_predicates.append(f"{col} >= {left_val!s} AND {col} <= {right_val!s}")

    if not pandas_predicates:
        return None, None

    try:
        card = len(df.query(" and ".join(pandas_predicates)))
    except Exception:
        # Best effort: an expression pandas cannot evaluate is treated as an
        # empty result. Only Exception is caught, so KeyboardInterrupt and
        # SystemExit still propagate and can stop a caller that retries.
        card = 0
    if card == 0:
        return None, None
    return sql_prefix + " AND ".join(sql_predicates), card

In [9]:
# --- Synthetic data parameters (arguments of data_generation) ---
skew = 1.0
domain_size = 100
correlation = 0.4
column_size = 10
nrows = 1_000_000

# --- Query workload parameters (arguments of query_generation) ---
num_sample = 200
p = 0.8
nval_per_col = 4
skip_zero_bit = 6

# --- Model parameters (passed to BN.build_from_data) ---
rows_to_use = 10_000  # forwarded as sample_size
n_mcv = 30
n_bins = 70

# Generate the table, a query workload over it, and fit the model.
data = data_generation(skew, domain_size, correlation, column_size, nrows=nrows)
name = f"toy_{skew}_{domain_size}_{correlation}_{column_size}"
queries, cards = query_generation(
    data,
    name,
    num_sample=num_sample,
    p=p,
    nval_per_col=nval_per_col,
    skip_zero_bit=skip_zero_bit,
)
BN = Bayescard_BN(name)
BN.build_from_data(data, sample_size=rows_to_use, n_mcv=n_mcv, n_bins=n_bins)

Discretizing table takes 4.798412084579468 secs
Structure learning took 1.342918872833252 secs.
done, parameter learning took 1.5890908241271973 secs.


In [10]:
# Draw a second, smaller table (100000 rows) with the same skew, domain size,
# correlation and column count as `data`, then pass it to the model's update().
# Relies on the parameters and on `BN` defined in the previous cell.
data2 = data_generation(skew, domain_size, correlation, column_size, nrows=100000)
BN.update(data2)

In [17]:
# Number of occurrences of each distinct attr0 value in the update table.
vc = data2['attr0'].value_counts()

In [25]:
# Look up the count stored under key 53 in the attr0 value counts
# (presumably a label lookup for the attr0 value 53 — TODO confirm).
vc[53]

1088