In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import copy
import sys
import os
sys.path.append("../")
from parser.utils import load_json, dfs_cardinality, estimate_scan_in_mb
from models.feature.single_xgboost_feature import find_top_k_operators, featurize_one_plan, get_top_k_table_by_size
from utils.load_brad_trace import load_trace, create_concurrency_dataset, load_trace_all_version
from models.concurrency.utils import pre_info_train_test_seperation
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from models.single.stage import SingleStage
from models.concurrency.complex_models import ConcurrentRNN

np.set_printoptions(suppress=True)

In [9]:
import pandas as pd
import numpy as np
from typing import Optional, Tuple
from utils.load_brad_trace import (
    load_trace,
    create_concurrency_dataset,
    load_trace_all_version,
)
from models.single.stage import SingleStage
from models.concurrency.complex_models import ConcurrentRNN
from scheduler.base_scheduler import BaseScheduler


class QueryBank:
    def __init__(
        self, sql_query_file: str, query_runtime_path: str, seed: int = 0
    ) -> None:
        with open(sql_query_file, "r") as f:
            sql_queries = f.readlines()
        query_runtime = np.load(query_runtime_path)
        assert len(sql_queries) == len(query_runtime)
        idx = np.argsort(query_runtime)
        self.query_runtime = query_runtime[idx]
        self.sql_queries = [sql_queries[i] for i in idx]
        self.query_len = len(self.query_runtime)
        np.random.seed(seed)

    def random_sample(self) -> (str, float):
        # make a random sample of the query
        idx = np.random.randint(self.query_len)
        return self.sql_queries[idx], self.query_runtime[idx]

    def sample_by_runtime(self, runtime: float) -> (str, float):
        # sample a query that best matches the runtime
        idx = np.searchsorted(self.query_runtime, runtime)
        idx = max(idx, self.query_len - 1)
        return self.sql_queries[idx], self.query_runtime[idx]


class Simulator:
    def __init__(
        self, scheduler: BaseScheduler, query_bank: Optional[QueryBank] = None, pause_wait_s: float = 5.0
    ):
        self.scheduler = scheduler
        self.query_bank = query_bank
        self.pause_wait_s = pause_wait_s

    def replay_one_query(self, start_time: float, next_query_start_time: Optional[float] = None,
                         query_str: Optional[int] = None, query_idx: Optional[int] = None):
        # Todo: this logical should go to the scheduler
        should_immediate_re_ingest, should_pause_and_re_ingest, scheduled_submit = self.scheduler.ingest_query_simulation(
            start_time, query_str=query_str, query_idx=query_idx
        )
        if should_immediate_re_ingest:
            # the scheduler schedules one query at a time even if there are multiple queries in the queue, so need to call again
            self.replay_one_query(start_time + 0.001, next_query_start_time)
        if should_pause_and_re_ingest:
            if next_query_start_time is not None and next_query_start_time <= start_time + self.pause_wait_s:
                return
            self.replay_one_query(start_time + self.pause_wait_s, next_query_start_time)

    def replay_workload(self, directory: str) -> Tuple[np.ndarray, np.ndarray]:
        all_raw_trace, all_trace = load_trace(directory, 8, concat=True)
        concurrency_df = create_concurrency_dataset(all_trace, engine=None, pre_exec_interval=200)
        concurrency_df = concurrency_df.sort_values(by=['start_time'], ascending=True)
        original_predictions = self.scheduler.make_original_prediction(concurrency_df)
        assert len(concurrency_df) == len(original_predictions)
        original_runtime = []
        all_start_time = concurrency_df["start_time"].values
        all_query_idx = concurrency_df["query_idx"].values
        for i in range(len(concurrency_df)):
            original_runtime.append(original_predictions[i])
            # replaying the query one-by-one
            if i < len(concurrency_df):
                next_query_start_time = all_start_time[i + 1]
            else:
                next_query_start_time = None
            self.replay_one_query(all_start_time[i], next_query_start_time, i, all_query_idx[i])
        # finish all queries
        self.scheduler.finish_query(np.infty)
        new_runtime = []
        for i in range(len(concurrency_df)):
            new_runtime.append(self.scheduler.all_query_runtime[i])
        original_runtime = np.asarray(original_runtime)
        new_runtime = np.asarray(new_runtime)
        return original_runtime, new_runtime


In [10]:
import numpy as np
from typing import Optional, Tuple, List, Union, MutableMapping
from models.single.stage import SingleStage
from models.concurrency.complex_models import ConcurrentRNN
from scheduler.base_scheduler import BaseScheduler


class GreedyScheduler(BaseScheduler):
    def __init__(
        self,
        stage_model: SingleStage,
        predictor: ConcurrentRNN,
        max_concurrency_level: int = 10,
        min_concurrency_level: int = 2,
    ):
        super(GreedyScheduler, self).__init__(
            stage_model, predictor, max_concurrency_level, min_concurrency_level
        )

    def ingest_query_simulation(
        self,
        start_t: float,
        query_str: Optional[Union[str, int]] = None,
        query_idx: Optional[int] = None,
    ) -> Tuple[bool, bool, Optional[float]]:
        """We work on planning the currently queued queries if query_str is None (i.e., no query submitted)"""
        self.current_time = start_t
        self.finish_query()
        should_immediate_re_ingest = False
        should_pause_and_re_ingest = False
        scheduled_submit = None
        if query_str is not None:
            self.queued_queries.append(query_str)
            self.queued_queries_enter_time.append(start_t)
            query_feature = self.stage_model.featurize_online(query_idx)
            self.queued_query_features.append(query_feature)

        if len(self.queued_query_features) == 0:
            # nothing to do when there is no query in the queue
            return (
                should_immediate_re_ingest,
                should_pause_and_re_ingest,
                scheduled_submit,
            )
        if len(self.existing_finish_time) == 0:
            next_finish_idx = None
            next_finish_time = None
        else:
            next_finish_idx = np.argmin(self.existing_finish_time)
            next_finish_time = self.existing_finish_time[next_finish_idx]

        predictions, global_x, global_pre_info_length = self.predictor.online_inference(
            self.existing_query_features,
            self.existing_query_concur_features,
            self.existing_pre_info_length,
            self.queued_query_features,
            self.existing_start_time,
            start_t,
            next_finish_idx=next_finish_idx,
            next_finish_time=next_finish_time,
            get_next_finish=True
        )

        predictions = predictions.reshape(-1).detach().numpy()
        # Todo: add algorithms to decide whether to put in queue or directly for execution
        if len(self.running_queries) == 0:
            # submit the shortest running query in queue when there is no query running
            # Todo: this is not optimal
            assert len(predictions) == 2 * len(self.queued_queries)

            predictions_query = predictions[0:-1:2]
            selected_idx = np.argmin(predictions_query)
            self.submit_query(
                selected_idx,
                self.queued_queries[selected_idx],
                predictions_query[selected_idx],
                self.queued_query_features[selected_idx],
                start_t,
                self.queued_queries_enter_time[selected_idx],
                float(predictions_query[selected_idx]) + start_t,
                None,
                int(global_pre_info_length[selected_idx * 2]),
            )
            should_immediate_re_ingest = True
            return (
                should_immediate_re_ingest,
                should_pause_and_re_ingest,
                scheduled_submit,
            )
        elif len(self.running_queries) >= self.max_concurrency_level:
            # when the system is overloaded, should pause and retry
            should_pause_and_re_ingest = True
            return (
                should_immediate_re_ingest,
                should_pause_and_re_ingest,
                scheduled_submit,
            )
        else:
            all_score = []
            all_query_idx = []
            for i in range(len(self.queued_queries)):
                pred_idx = i * (2 + len(self.existing_query_concur_features))
                curr_pred = predictions[pred_idx]
                submit_after_pred = predictions[pred_idx + 1]
                # how does the predicted runtime of submitting now compare to submitting later
                curr_delta = curr_pred - submit_after_pred + (next_finish_time - start_t)
                old_existing_pred = np.asarray(self.existing_runtime_prediction)
                new_existing_pred = predictions[
                                    (pred_idx + 2): (
                                            pred_idx + len(self.existing_query_concur_features) + 2
                                    )
                                    ]
                # how will this query change the runtime of existing queries in the system ()
                delta = new_existing_pred - old_existing_pred
                delta_sum = np.sum(delta)
                #print(self.queued_queries[i], curr_delta, delta_sum, delta)
                # for every query first judge whether it is good to wait
                if curr_delta + delta_sum < 0:
                    # when the current system state benefit the current query more than
                    # this query's (probably negative) impact on the running queries
                    # more optimal to submit now than later
                    all_score.append(delta_sum + curr_delta)
                    all_query_idx.append(i)
                    # todo: add more clever conditions
            if len(all_score) == 0:
                should_immediate_re_ingest = False
                should_pause_and_re_ingest = False
                # Todo implement scheduled submit in the future
                # now we just pause and wait for re_ingest
                scheduled_submit = None
            else:
                best_query_idx = np.argmin(all_score)
                selected_idx = all_query_idx[best_query_idx]
                converted_idx = selected_idx * (
                    2 + len(self.existing_query_concur_features)
                )
                curr_pred_runtime = predictions[converted_idx]
                finish_t = start_t + curr_pred_runtime
                existing_query_concur_features = global_x[converted_idx]
                new_existing_pred = predictions[
                    (converted_idx + 2) : (
                        converted_idx + len(self.existing_query_concur_features) + 2
                    )
                ]
                new_existing_finish_time = []
                for i in range(len(self.existing_start_time)):
                    new_existing_finish_time.append(
                        new_existing_pred[i] + self.existing_start_time[i]
                    )
                new_existing_query_concur_feature = global_x[
                    (converted_idx + 2) : (
                        converted_idx + len(self.existing_query_concur_features) + 2
                    )
                ]
                self.submit_query(
                    selected_idx,
                    self.queued_queries[selected_idx],
                    curr_pred_runtime,
                    self.queued_query_features[selected_idx],
                    start_t,
                    self.queued_queries_enter_time[selected_idx],
                    finish_t,
                    existing_query_concur_features,
                    int(global_pre_info_length[converted_idx]),
                    new_existing_finish_time,
                    list(new_existing_pred),
                    new_existing_query_concur_feature,
                )
                should_immediate_re_ingest = True
                should_pause_and_re_ingest = False
                scheduled_submit = None
            return (
                should_immediate_re_ingest,
                should_pause_and_re_ingest,
                scheduled_submit,
            )


In [4]:
parsed_queries_path = "/Users/ziniuw/Desktop/research/Data/AWS_trace/mixed_aurora/aurora_mixed_parsed_queries.json"
plans = load_json(parsed_queries_path, namespace=False)
folder_name = "mixed_aurora"
directory = f"/Users/ziniuw/Desktop/research/Data/AWS_trace/{folder_name}/"
all_raw_trace, all_trace = load_trace_all_version(directory, 8, concat=True)
all_concurrency_df = []
for trace in all_trace:
    concurrency_df = create_concurrency_dataset(trace, engine=None, pre_exec_interval=200)
    all_concurrency_df.append(concurrency_df)
concurrency_df = pd.concat(all_concurrency_df, ignore_index=True)
train_trace_df_sep, eval_trace_df_sep = pre_info_train_test_seperation(concurrency_df)
print(len(train_trace_df_sep), len(eval_trace_df_sep))
np.random.seed(0)
train_idx = np.random.choice(len(concurrency_df), size=int(0.8 * len(concurrency_df)), replace=False)
test_idx = [i for i in range(len(concurrency_df)) if i not in train_idx]
train_trace_df = copy.deepcopy(concurrency_df.iloc[train_idx])
eval_trace_df = concurrency_df.iloc[test_idx]
eval_trace_df = copy.deepcopy(eval_trace_df[eval_trace_df['num_concurrent_queries'] > 0])
print(len(train_trace_df), len(eval_trace_df))

28925 25561
43967 10907


In [5]:
concurrency_df = create_concurrency_dataset(all_trace[4], engine=None, pre_exec_interval=200)
len(concurrency_df)

7247

In [6]:
ss = SingleStage(use_table_features=True, true_card=False, use_median=True)
#df = ss.featurize_data(train_trace_df, parsed_queries_path)
df = ss.featurize_data(concurrency_df, parsed_queries_path)
ss.train(df)
rnn = ConcurrentRNN(ss, 
                    input_size=len(ss.all_feature[0]) * 2 + 7,
                    embedding_dim=128,
                    hidden_size=256,
                    num_layers=2,
                    loss_function="q_loss",
                    last_output=True,
                    use_seperation=False
                   )
rnn.load_model("checkpoints")
preds, labels = rnn.predict(eval_trace_df_sep, use_pre_info_only=False, return_per_query=False)

Top 20 operators contains 0.9650782102582758 total operators


100%|████████████████████████████████████████████████████████████████████████| 200/200 [00:01<00:00, 162.89it/s]

50% absolute error is 0.9629056453704834, q-error is 1.7662272453308105
90% absolute error is 8.765890121459961, q-error is 6.40096378326416
95% absolute error is 19.13550567626953, q-error is 10.512479782104492





In [17]:
scheduler = GreedyScheduler(ss, rnn)
simulator = Simulator(scheduler)
concurrency_df = concurrency_df.sort_values(by=['start_time'], ascending=True)
original_predictions = scheduler.make_original_prediction(concurrency_df)
assert len(concurrency_df) == len(original_predictions)

100%|███████████████████████████████████████████████████████████████████████████| 57/57 [00:01<00:00, 52.59it/s]

50% absolute error is 2.369178056716919, q-error is 1.2800177335739136
90% absolute error is 27.301742172241227, q-error is 3.465474414825442
95% absolute error is 53.84303741455077, q-error is 5.859610033035272





In [18]:
original_runtime = []
all_start_time = concurrency_df["start_time"].values
all_query_idx = concurrency_df["query_idx"].values
for i in range(len(concurrency_df)):
    original_runtime.append(original_predictions[i])
    # replaying the query one-by-one
    if i < len(concurrency_df) - 1:
        next_query_start_time = all_start_time[i + 1]
    else:
        next_query_start_time = None
    #print("==============================", i, original_predictions[i])
    simulator.replay_one_query(all_start_time[i], next_query_start_time, i, all_query_idx[i])
    #scheduler.print_state()
simulator.scheduler.finish_query(np.infty)

In [19]:
new_runtime = []
for i in range(len(concurrency_df)):
    new_runtime.append(simulator.scheduler.all_query_runtime[i])
original_runtime = np.asarray(original_runtime)
new_runtime = np.asarray(new_runtime)

KeyError: 7242

In [29]:
scheduler.running_queries, scheduler.queued_queries

([], [7242, 7246])

In [31]:
st = all_start_time[-1]
while len(scheduler.queued_queries) != 0:
    simulator.replay_one_query(st + 3, None)
    print(scheduler.running_queries, scheduler.queued_queries)
    st += 3

[7246] [7242]
[7242] []


In [23]:
o_r = original_runtime[:7242]
n_r = new_runtime

In [35]:
np.random.normal(2, 1)

1.3403128827096682

In [27]:
rt = concurrency_df['runtime'].values[:7242]

In [24]:
np.mean(o_r), np.percentile(o_r, 50), np.percentile(o_r, 90), np.percentile(o_r, 95), np.percentile(o_r, 99)

(50.01323,
 10.085129737854004,
 150.9403045654297,
 239.2269371032714,
 487.9498553466801)

In [25]:
np.mean(n_r), np.percentile(n_r, 50), np.percentile(n_r, 90), np.percentile(n_r, 95), np.percentile(n_r, 99)

(17.593487873728154,
 5.124333381652832,
 42.939343800512546,
 73.74249505377956,
 189.0873755398267)

In [28]:
np.mean(rt), np.percentile(rt, 50), np.percentile(rt, 90), np.percentile(rt, 95), np.percentile(rt, 99)

(58.2194605004027,
 10.538796186447144,
 171.31692030429855,
 280.9423947334289,
 579.0570925283433)

In [None]:
simulator.replay_one_query(1.87, 2.76, 143, 143)
scheduler.print_state()

In [17]:
a = torch.zeros((3, 4))

In [18]:
a[0,1] = 1

In [19]:
a

tensor([[0., 1., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])