In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import copy
import sys
import os
sys.path.append("../")
from parser.utils import load_json, dfs_cardinality, estimate_scan_in_mb
from models.feature.single_xgboost_feature import find_top_k_operators, featurize_one_plan, get_top_k_table_by_size
from utils.load_brad_trace import load_trace, create_concurrency_dataset, load_trace_all_version
from models.concurrency.utils import pre_info_train_test_seperation
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from models.single.stage import SingleStage
from models.concurrency.complex_models import ConcurrentRNN

# Print floating point numbers in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [2]:
import numpy as np
from typing import Optional, Tuple
from utils.load_brad_trace import (
    load_trace,
    create_concurrency_dataset,
)
from scheduler.base_scheduler import BaseScheduler


class QueryBank:
    """Pool of SQL queries with known runtimes that can be sampled from.

    Queries and runtimes are stored sorted by ascending runtime so that a
    query close to a requested runtime can be located with a binary search.
    """

    def __init__(
        self, sql_query_file: str, query_runtime_path: str, seed: int = 0
    ) -> None:
        """
        :param sql_query_file: text file read line by line; each line is one query
        :param query_runtime_path: file loadable with ``np.load`` holding one runtime per
                                   line of ``sql_query_file``, in the same order
        :param seed: seed applied to NumPy's *global* random state
        """
        with open(sql_query_file, "r") as f:
            sql_queries = f.readlines()
        query_runtime = np.load(query_runtime_path)
        assert len(sql_queries) == len(query_runtime)
        # keep queries and runtimes aligned while sorting by runtime
        idx = np.argsort(query_runtime)
        self.query_runtime = query_runtime[idx]
        self.sql_queries = [sql_queries[i] for i in idx]
        self.query_len = len(self.query_runtime)
        # NOTE: this seeds the process-wide NumPy generator, not a private one
        np.random.seed(seed)

    def random_sample(self) -> Tuple[str, float]:
        """Return a uniformly sampled ``(query, runtime)`` pair."""
        idx = np.random.randint(self.query_len)
        return self.sql_queries[idx], self.query_runtime[idx]

    def sample_by_runtime(self, runtime: float) -> Tuple[str, float]:
        """Return the ``(query, runtime)`` pair whose runtime is closest to ``runtime``.

        Ties between the two neighbouring runtimes are resolved towards the
        larger one.
        """
        idx = int(np.searchsorted(self.query_runtime, runtime))
        # searchsorted returns query_len when runtime exceeds every stored
        # runtime, so clamp to the last valid position (this used to be `max`,
        # which always selected the last query or indexed out of range)
        idx = min(idx, self.query_len - 1)
        # the left neighbour may be a closer match than the insertion point
        if idx > 0 and abs(runtime - self.query_runtime[idx - 1]) < abs(
            self.query_runtime[idx] - runtime
        ):
            idx -= 1
        return self.sql_queries[idx], self.query_runtime[idx]


class Simulator:
    """Replays a workload trace through a scheduler in simulation mode."""

    def __init__(
        self,
        scheduler: "BaseScheduler",
        query_bank: Optional["QueryBank"] = None,
        pause_wait_s: float = 5.0,
    ):
        """
        :param scheduler: scheduler that decides when each ingested query gets submitted
        :param query_bank: optional pool of queries; stored on the instance but not read
                           by any method of this class
        :param pause_wait_s: simulated seconds to wait before retrying after the scheduler
                             asked to pause
        """
        self.scheduler = scheduler
        self.query_bank = query_bank
        self.pause_wait_s = pause_wait_s

    def replay_one_query(
        self,
        start_time: float,
        next_query_start_time: Optional[float] = None,
        query_str: Optional[int] = None,
        query_idx: Optional[int] = None,
    ):
        """Ingest one query (or just re-plan the queue) at ``start_time``.

        :param start_time: simulated time at which the scheduler is invoked
        :param next_query_start_time: arrival time of the next query in the trace, or None
                                      when there is none; bounds how long a pause may retry
        :param query_str: identifier of the arriving query; None re-plans the queue only
        :param query_idx: index of the arriving query passed through to the scheduler
        """
        (
            should_immediate_re_ingest,
            should_pause_and_re_ingest,
            scheduled_submit,
        ) = self.scheduler.ingest_query(
            start_time, query_str=query_str, query_idx=query_idx, simulation=True
        )
        if should_immediate_re_ingest:
            # the scheduler schedules one query at a time even if there are multiple queries in the queue,
            # so need to call again (without a new query) slightly later
            self.replay_one_query(start_time + 0.001, next_query_start_time)
        if should_pause_and_re_ingest:
            # this indicates it is not optimal to submit any query in the queue, will try in a future time
            retry_time = start_time + self.pause_wait_s
            if next_query_start_time is not None and next_query_start_time <= retry_time:
                # the next arrival triggers the scheduler before the pause would end
                return
            self.replay_one_query(retry_time, next_query_start_time)

    def finish_all_queries(self, last_timestamp: float):
        """Keep invoking the scheduler until its queue is empty, then finish all running queries.

        :param last_timestamp: simulated time of the last arrival in the trace
        """
        start_t = last_timestamp
        while len(self.scheduler.queued_queries) != 0:
            # make sure all queries are submitted
            start_t += self.pause_wait_s
            self.replay_one_query(start_t, None)
        # finish executing all submitted queries; np.inf is used because the
        # np.infty alias was removed in NumPy 2.0
        self.scheduler.finish_query_simulation(np.inf)

    def replay_workload(self, directory: str) -> Tuple[np.ndarray, np.ndarray]:
        """Replay the trace stored in ``directory`` and compare runtimes.

        :param directory: folder passed to ``load_trace``
        :return: ``(original_runtime, new_runtime)`` where the first array holds the
                 scheduler's original predictions and the second the values of
                 ``scheduler.all_query_runtime`` after the replay, both in start-time order
        """
        all_raw_trace, all_trace = load_trace(directory, 8, concat=True)
        concurrency_df = create_concurrency_dataset(
            all_trace, engine=None, pre_exec_interval=200
        )
        concurrency_df = concurrency_df.sort_values(by=["start_time"], ascending=True)
        original_predictions = self.scheduler.make_original_prediction(concurrency_df)
        assert len(concurrency_df) == len(original_predictions)
        num_queries = len(concurrency_df)
        all_start_time = concurrency_df["start_time"].values
        all_query_idx = concurrency_df["query_idx"].values
        original_runtime = []
        for i in range(num_queries):
            original_runtime.append(original_predictions[i])
            # replaying the query one-by-one; the position i doubles as the query identifier
            next_query_start_time = (
                all_start_time[i + 1] if i < num_queries - 1 else None
            )
            self.replay_one_query(
                all_start_time[i], next_query_start_time, i, all_query_idx[i]
            )
        # finish all queries
        self.finish_all_queries(all_start_time[-1])
        new_runtime = [self.scheduler.all_query_runtime[i] for i in range(num_queries)]
        return np.asarray(original_runtime), np.asarray(new_runtime)


In [26]:
import numpy as np
import copy
from typing import Optional, Tuple, List, Union, MutableMapping
from models.single.stage import SingleStage
from models.concurrency.complex_models import ConcurrentRNN
from scheduler.base_scheduler import BaseScheduler


class GreedyScheduler(BaseScheduler):
    """Scheduler that submits at most one queued query per call, chosen greedily
    from the runtime predictions of the concurrency model."""

    def __init__(
        self,
        stage_model: SingleStage,
        predictor: ConcurrentRNN,
        max_concurrency_level: int = 10,
        min_concurrency_level: int = 2,
    ):
        """
        :param stage_model: prediction and featurization for a single query
        :param predictor: predict the runtime of concurrent queries
        :param max_concurrency_level: [hyperparameter] the maximal amount of concurrent queries the system can ingest,
                                      can set to a very big value if don't know how to set
        :param min_concurrency_level: [hyperparameter] not useful for greedy scheduler
        """
        super().__init__(
            stage_model, predictor, max_concurrency_level, min_concurrency_level
        )

    def ingest_query(
        self,
        start_t: float,
        query_str: Optional[Union[str, int]] = None,
        query_idx: Optional[int] = None,
        simulation: bool = True,
    ) -> Tuple[bool, bool, Optional[Union[str, int]]]:
        """Enqueue the arriving query (if any) and decide whether to submit one queued query now.

        When query_str is None no query is submitted to the queue and only the
        currently queued queries are planned.

        :return: (should_immediate_re_ingest, should_pause_and_re_ingest, scheduled_submit);
                 scheduled_submit is a copy of the queue entry submitted by this call, or None
        """
        self.current_time = start_t
        if not simulation:
            # adjusting the finishing time of running queries (due to error in estimation):
            # none of them may finish earlier than "now" plus a random positive margin
            for pos in range(len(self.existing_finish_time)):
                jitter = np.abs(np.random.normal(2, 1))
                self.existing_finish_time[pos] = max(
                    self.existing_finish_time[pos], self.current_time + jitter
                )
        else:
            self.finish_query_simulation()

        if query_str is not None:
            self.queued_queries.append(query_str)
            self.queued_queries_enter_time.append(start_t)
            feature = self.stage_model.featurize_online(query_idx)
            self.queued_query_features.append(feature)

        if len(self.queued_query_features) == 0:
            # nothing to do when there is no query in the queue
            return False, False, None

        # the running query expected to finish first, if anything is running
        next_finish_idx = None
        next_finish_time = None
        if len(self.existing_finish_time) != 0:
            next_finish_idx = np.argmin(self.existing_finish_time)
            next_finish_time = self.existing_finish_time[next_finish_idx]

        predictions, global_x, global_pre_info_length = self.predictor.online_inference(
            self.existing_query_features,
            self.existing_query_concur_features,
            self.existing_pre_info_length,
            self.queued_query_features,
            self.existing_start_time,
            start_t,
            next_finish_idx=next_finish_idx,
            next_finish_time=next_finish_time,
            get_next_finish=True,
        )
        predictions = predictions.reshape(-1).detach().numpy()

        num_running = len(self.running_queries)
        if num_running == 0:
            # submit the shortest running query in queue when there is no query running
            # Todo: this is not optimal
            assert len(predictions) == 2 * len(self.queued_queries)
            # every queued query owns two consecutive predictions; take the first of each pair
            submit_now_preds = predictions[0:-1:2]
            pick = np.argmin(submit_now_preds)
            scheduled_submit = copy.deepcopy(self.queued_queries[pick])
            self.submit_query(
                pick,
                self.queued_queries[pick],
                submit_now_preds[pick],
                self.queued_query_features[pick],
                start_t,
                self.queued_queries_enter_time[pick],
                float(submit_now_preds[pick]) + start_t,
                None,
                int(global_pre_info_length[pick * 2]),
            )
            return True, False, scheduled_submit

        if num_running >= self.max_concurrency_level:
            # when the system is overloaded, should pause and retry
            return False, True, None

        # Each queued query owns `stride` consecutive predictions:
        # [submit now, submit later, one entry per running query].
        num_existing = len(self.existing_query_concur_features)
        stride = 2 + num_existing
        candidate_scores = []
        candidate_positions = []
        for queue_pos in range(len(self.queued_queries)):
            base = queue_pos * stride
            submit_now = predictions[base]
            submit_later = predictions[base + 1]
            # how does the predicted runtime of submitting now compare to submitting later
            own_delta = (
                submit_now
                - submit_later
                - (next_finish_time - self.queued_queries_enter_time[queue_pos])
            )
            running_before = np.asarray(self.existing_runtime_prediction)
            running_after = predictions[(base + 2) : (base + stride)]
            # how will this query change the runtime of existing queries in the system
            impact_sum = np.sum(running_after - running_before)
            # for every query first judge whether it is good to wait
            score = own_delta + impact_sum
            if score < 0:
                # when the current system state benefit the current query more than
                # this query's (probably negative) impact on the running queries
                # more optimal to submit now than later
                candidate_scores.append(score)
                candidate_positions.append(queue_pos)
                # TODO: is there more clever condition?

        if len(candidate_scores) == 0:
            # no queued query is better off being submitted right now
            return False, False, None

        # TODO: use linear programming rather than argmax
        selected_idx = candidate_positions[np.argmin(candidate_scores)]
        converted_idx = selected_idx * stride
        running_window = slice(converted_idx + 2, converted_idx + stride)
        curr_pred_runtime = predictions[converted_idx]
        finish_t = start_t + curr_pred_runtime
        selected_concur_features = global_x[converted_idx]
        new_existing_pred = predictions[running_window]
        # refreshed finish times of the running queries given the new submission
        new_existing_finish_time = [
            new_existing_pred[k] + self.existing_start_time[k]
            for k in range(len(self.existing_start_time))
        ]
        new_existing_query_concur_feature = global_x[running_window]
        scheduled_submit = copy.deepcopy(self.queued_queries[selected_idx])
        self.submit_query(
            selected_idx,
            self.queued_queries[selected_idx],
            curr_pred_runtime,
            self.queued_query_features[selected_idx],
            start_t,
            self.queued_queries_enter_time[selected_idx],
            finish_t,
            selected_concur_features,
            int(global_pre_info_length[converted_idx]),
            new_existing_finish_time,
            list(new_existing_pred),
            new_existing_query_concur_feature,
        )
        return True, False, scheduled_submit


In [48]:
# Load the parsed query plans and every version of the postgres trace, build one
# concurrency dataset per trace, and split the concatenated result into train/eval.
#parsed_queries_path = "/Users/ziniuw/Desktop/research/Data/AWS_trace/mixed_aurora/aurora_mixed_parsed_queries.json"
parsed_queries_path = "/Users/ziniuw/Desktop/research/Data/AWS_trace/mixed_postgres/postgres_mixed_parsed_queries.json"
plans = load_json(parsed_queries_path, namespace=False)
folder_name = "mixed_postgres"
directory = f"/Users/ziniuw/Desktop/research/Data/AWS_trace/{folder_name}/"
all_raw_trace, all_trace = load_trace_all_version(directory, 8, concat=True)
all_concurrency_df = []
for trace in all_trace:
    concurrency_df = create_concurrency_dataset(trace, engine=None, pre_exec_interval=200)
    all_concurrency_df.append(concurrency_df)
concurrency_df = pd.concat(all_concurrency_df, ignore_index=True)
train_trace_df_sep, eval_trace_df_sep = pre_info_train_test_seperation(concurrency_df)
print(len(train_trace_df_sep), len(eval_trace_df_sep))
# Random 80/20 split over row positions, seeded for reproducibility.
np.random.seed(0)
train_idx = np.random.choice(len(concurrency_df), size=int(0.8 * len(concurrency_df)), replace=False)
# Complement of train_idx in ascending order. setdiff1d avoids the quadratic
# `i not in train_idx` scan over a NumPy array while producing the same list.
test_idx = np.setdiff1d(np.arange(len(concurrency_df)), train_idx).tolist()
train_trace_df = copy.deepcopy(concurrency_df.iloc[train_idx])
eval_trace_df = concurrency_df.iloc[test_idx]
# Evaluate only on queries that ran concurrently with at least one other query.
eval_trace_df = copy.deepcopy(eval_trace_df[eval_trace_df['num_concurrent_queries'] > 0])
print(len(train_trace_df), len(eval_trace_df))

17603 13399
24889 6198


In [5]:
# Preview the first five rows of the concurrency dataset.
concurrency_df.head(5)

Unnamed: 0,index,query_idx,runtime,start_time,end_time,pre_exec_info,concur_info,num_concurrent_queries,concur_info_train,num_concurrent_queries_train
0,0,24,85.749652,0.0,85.749652,[],"[(204, 0.0, 1.0899443626403809), (202, 1.86680...",8,[],0
1,1,204,1.089944,0.0,1.089944,[],"[(24, 0.0, 85.74965190887451)]",1,"[(24, 0.0, 85.74965190887451)]",1
2,2,202,3.19667,1.866808,5.063478,"[(204, 0.0, 1.0899443626403809)]","[(24, 0.0, 85.74965190887451), (82, 2.55093400...",3,"[(24, 0.0, 85.74965190887451)]",1
3,3,82,117.75827,2.550934,120.309204,"[(204, 0.0, 1.0899443626403809)]","[(24, 0.0, 85.74965190887451), (202, 1.8668079...",11,"[(24, 0.0, 85.74965190887451), (202, 1.8668079...",2
4,4,151,3.069139,3.720243,6.789382,"[(204, 0.0, 1.0899443626403809)]","[(24, 0.0, 85.74965190887451), (202, 1.8668079...",3,"[(24, 0.0, 85.74965190887451), (202, 1.8668079...",3


In [53]:
# Rebuild concurrency_df from the last loaded trace only (overwrites the previous
# value of concurrency_df) and show its number of rows.
concurrency_df = create_concurrency_dataset(all_trace[-1], engine=None, pre_exec_interval=200)
len(concurrency_df)

18545

In [55]:
# Featurize and train the single-query stage model on the full concurrency_df
# (the commented-out line would use only the training split instead), then build
# the concurrent-runtime RNN around it and restore it via load_model("checkpoints").
ss = SingleStage(use_table_features=True, true_card=False, use_median=True)
#df = ss.featurize_data(train_trace_df, parsed_queries_path)
df = ss.featurize_data(concurrency_df, parsed_queries_path)
ss.train(df)
# input_size is derived from the length of one stage-model feature vector.
rnn = ConcurrentRNN(ss, 
                     "postgres",
                    input_size=len(ss.all_feature[0]) * 2 + 7,
                    embedding_dim=128,
                    hidden_size=256,
                    num_layers=2,
                    loss_function="q_loss",
                    last_output=True,
                    use_separation=False
                   )
rnn.load_model("checkpoints")

Top 20 operators contains 0.9642989959092599 total operators


In [56]:
# Persist the trained stage model to disk with pickle.
# (pickle is already imported as pkl in the first cell; this import is redundant.)
import pickle as pkl
with open("checkpoints/postgres_stage_model.pkl", "wb") as f:
    pkl.dump(ss, f)

In [41]:
# Reload the pickled stage model into `a`.
# NOTE(review): pickle.load executes arbitrary code from the file; only load trusted checkpoints.
with open("checkpoints/postgres_stage_model.pkl", "rb") as f:
    a = pkl.load(f)

In [57]:
# Load the trace from `directory` with load_trace (instead of load_trace_all_version)
# and rebuild concurrency_df from it; overwrites all_raw_trace, all_trace and concurrency_df.
all_raw_trace, all_trace = load_trace(directory, 8, concat=True)
concurrency_df = create_concurrency_dataset(
            all_trace, engine=None, pre_exec_interval=200
)

In [58]:
# Create a fresh scheduler and simulator, order the workload by start time, and
# obtain the scheduler's original prediction for every row.
scheduler = GreedyScheduler(ss, rnn)
simulator = Simulator(scheduler)
concurrency_df = concurrency_df.sort_values(by=['start_time'], ascending=True)
original_predictions = scheduler.make_original_prediction(concurrency_df)
# one prediction per query is required by the replay loop
assert len(concurrency_df) == len(original_predictions)

100%|███████████████████████████████████████████████████████████████████████████| 42/42 [00:01<00:00, 22.90it/s]

50% absolute error is 2.6314380168914795, q-error is 1.290785551071167
90% absolute error is 20.887972640991208, q-error is 3.5905029058456415
95% absolute error is 32.35556869506836, q-error is 10.403009319305417





In [29]:
# Same setup as the previous cell: fresh scheduler and simulator, workload sorted
# by start time, original predictions recomputed.
# NOTE(review): this cell duplicates the one above; keeping both makes execution order ambiguous.
scheduler = GreedyScheduler(ss, rnn)
simulator = Simulator(scheduler)
concurrency_df = concurrency_df.sort_values(by=['start_time'], ascending=True)
original_predictions = scheduler.make_original_prediction(concurrency_df)
# one prediction per query is required by the replay loop
assert len(concurrency_df) == len(original_predictions)

100%|█████████████████████████████████████████████████████████████████████████| 145/145 [00:09<00:00, 14.99it/s]

50% absolute error is 1.4944992065429688, q-error is 1.1646299362182617
90% absolute error is 10.884896850585944, q-error is 2.467008304595949
95% absolute error is 18.43571166992186, q-error is 4.286227607727048





In [30]:
# Replay the workload query by query through the simulator, printing the scheduler
# state after each arrival. Mirrors Simulator.replay_workload with extra printing.
original_runtime = []
all_start_time = concurrency_df["start_time"].values
all_query_idx = concurrency_df["query_idx"].values
for i in range(len(concurrency_df)):
    original_runtime.append(original_predictions[i])
    # replaying the query one-by-one
    if i < len(concurrency_df) - 1:
        next_query_start_time = all_start_time[i + 1]
    else:
        # last query of the trace: there is no following arrival
        next_query_start_time = None
    print("==============================", i, original_predictions[i])
    # the row position i is used as the query identifier (query_str)
    simulator.replay_one_query(all_start_time[i], next_query_start_time, i, all_query_idx[i])
    scheduler.print_state()
# drain the queue and finish every query still running
simulator.finish_all_queries(all_start_time[-1])

current time:  0.001
running_queries:  [(0, 7.6468697)]
queued_queries:  []
current time:  0.001
running_queries:  [(0, 2.7874904), (1, 2.3108683)]
queued_queries:  []
current time:  0.0
running_queries:  [(0, 2.7874904), (1, 2.3108683)]
queued_queries:  [2]
current time:  4.846147000000001
running_queries:  [(2, 3.696238), (3, 3.7229943)]
queued_queries:  []
current time:  10.585227999999999
running_queries:  [(4, 17.871504)]
queued_queries:  []
current time:  12.759001999999997
running_queries:  [(4, 8.068478), (5, 4.064131)]
queued_queries:  []
current time:  12.773885
running_queries:  [(4, 8.068478), (5, 4.064131)]
queued_queries:  [6]
current time:  12.873694999999998
running_queries:  [(4, 8.068478), (5, 4.064131)]
queued_queries:  [6, 7]
current time:  13.483902999999998
running_queries:  [(4, 8.068478), (5, 4.064131)]
queued_queries:  [6, 7, 8]
current time:  18.488340000000004
running_queries:  [(4, 24.824223), (9, 22.843575), (6, 6.333028), (7, 14.955166)]
queued_queries:  [

TypeError: BaseScheduler.finish_query() missing 1 required positional argument: 'query_str'

In [31]:
# Drain the scheduler queue, then compare the number of original predictions,
# workload rows and runtimes recorded by the scheduler.
simulator.finish_all_queries(all_start_time[-1])
len(original_runtime), len(concurrency_df), len(simulator.scheduler.all_query_runtime)

(18545, 18545, 18545)

In [32]:
# Collect the runtime the scheduler recorded for each query, looked up by row
# position, and convert both runtime lists to arrays.
new_runtime = []
for i in range(len(concurrency_df)):
    new_runtime.append(simulator.scheduler.all_query_runtime[i])
original_runtime = np.asarray(original_runtime)
new_runtime = np.asarray(new_runtime)

In [35]:
# Runtime column of the first 7242 rows of concurrency_df.
# NOTE(review): 7242 is a hard-coded row count — confirm where it comes from.
rt = concurrency_df['runtime'].values[:7242]

In [33]:
# Mean and 50th/90th/95th/99th percentiles of original_runtime (the scheduler's original predictions).
np.mean(original_runtime), np.percentile(original_runtime, 50), np.percentile(original_runtime, 90), np.percentile(original_runtime, 95), np.percentile(original_runtime, 99)


(59.012215,
 8.142634391784668,
 140.9555450439454,
 232.6310485839842,
 999.8841186523438)

In [34]:
# Mean and 50th/90th/95th/99th percentiles of new_runtime (runtimes recorded by the scheduler after the replay).
np.mean(new_runtime), np.percentile(new_runtime, 50), np.percentile(new_runtime, 90), np.percentile(new_runtime, 95), np.percentile(new_runtime, 99)


(11.53676775879018,
 7.391608238220215,
 24.627706767502257,
 32.93593210519175,
 56.83942967302437)

In [36]:
# Mean and 50th/90th/95th/99th percentiles of rt (the `runtime` column slice taken from concurrency_df).
np.mean(rt), np.percentile(rt, 50), np.percentile(rt, 90), np.percentile(rt, 95), np.percentile(rt, 99)

(83.23669800103879,
 10.384392738342285,
 184.4025166749955,
 379.0671509861943,
 1000.0322662138939)

In [None]:
# Manual check: ingest a single query at t=1.87 with the next arrival at t=2.76
# (query_str=143, query_idx=143) and print the resulting scheduler state.
simulator.replay_one_query(1.87, 2.76, 143, 143)
scheduler.print_state()

In [17]:
# Scratch: create a 3x4 tensor of zeros (reuses the name `a`).
a = torch.zeros((3, 4))

In [18]:
# Scratch: set the element in row 0, column 1 in place.
a[0,1] = 1

In [19]:
# Scratch: display the tensor after the in-place assignment.
a

tensor([[0., 1., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [42]:
# Display the stage model object.
ss

<models.single.stage.SingleStage at 0x2ea2fdb90>

In [87]:
# Load four saved runtime arrays from the checkpoints folder; `rt` is overwritten here.
# NOTE(review): judging by the file names these are system execution time and
# end-to-end runtime for a baseline and for our scheduler — confirm against the code that wrote them.
rt = np.load("checkpoints/timeout_200_sys_exec_time_baseline.npy")
e2e_rt = np.load("checkpoints/timeout_200_e2e_runtime_baseline.npy")
ours_rt = np.load("checkpoints/timeout_200_sys_exec_time_ours.npy")
ours_e2e = np.load("checkpoints/timeout_200_e2e_runtime_ours.npy")

In [92]:
def report_performance(rt, idx):
    """Print the mean and the 50th/90th/95th/99th percentiles of ``rt[idx]`` on one line.

    :param rt: array of runtimes
    :param idx: index (or index array) selecting the entries of ``rt`` to summarize
    """
    selected = rt[idx]
    summary = [np.mean(selected)]
    for quantile in (50, 90, 95, 99):
        summary.append(np.percentile(selected, quantile))
    print(*summary)

In [93]:
# Keep only the positions where ours_e2e is positive and report the same subset
# for all four arrays so that the printed rows are comparable.
idx = np.where(ours_e2e > 0)[0]
report_performance(rt, idx)
report_performance(e2e_rt, idx)
report_performance(ours_rt, idx)
report_performance(ours_e2e, idx)

24.579775537304606 5.487337112426758 74.3676296234131 88.43337535858151 189.5138358688356
27.824789178090466 9.015914916992188 77.419335269928 91.70076365470884 192.7225460815431
18.856006538827813 6.236668825149536 55.82426891326908 67.88316702842712 92.04459434509285
45.63572805899161 33.19993305206299 98.33843498229982 118.80950570106505 205.61918326377923


In [78]:
# Mean and 50th/90th/95th/99th percentiles of original_runtime (same expression as an earlier cell).
np.mean(original_runtime), np.percentile(original_runtime, 50), np.percentile(original_runtime, 90), np.percentile(original_runtime, 95), np.percentile(original_runtime, 99)

(500, 500)

In [94]:
# Side-by-side view of the first 100 entries; columns are rt, e2e_rt, ours_rt, ours_e2e.
np.stack((rt[0:100], e2e_rt[0:100], ours_rt[0:100], ours_e2e[0:100]), axis=1)

array([[ 73.77101207,  77.15674686,  65.77072001,  69.21388006],
       [ 41.61805916,  44.58631682,  13.1934011 ,  17.059829  ],
       [ 14.49293399,  17.53258801,   6.2059052 ,  27.10757184],
       [154.08231282, 157.3001411 ,  72.50206995, 145.47488999],
       [ 14.30631018,  17.03120804,   6.04234123,  26.095752  ],
       [ 82.27517271,  85.17092824,  62.05491114,  79.24143672],
       [  8.91979384,  12.02127481,   8.21936703,  23.08981514],
       [  8.42301416,  12.02406192,  14.55130291,  51.1996901 ],
       [ 41.049613  ,  44.57659006,  17.84039807,  91.39779115],
       [  3.892869  ,   7.01349235,   5.12024808,   8.55051804],
       [  0.70403409,   4.00736213,   0.65989995,  10.56943083],
       [196.17296886, 199.38982201,  66.60334802, 127.49797702],
       [ 62.08480692,  65.61422682,  51.75367308,  56.1910162 ],
       [ 20.77441287,  24.54313993,   2.29508686,  30.66539001],
       [  1.22894287,   4.50839901,   1.11718798,  34.72699499],
       [  0.70410991,   4

In [66]:
# Print row position, start_time, runtime and query_idx for the first 100 rows.
for i in range(100):
    print(i, concurrency_df["start_time"].iloc[i], concurrency_df["runtime"].iloc[i], concurrency_df["query_idx"].iloc[i])

0 0.0 85.74965190887451 24
1 0.0 1.0899443626403809 204
2 1.8668079999999998 3.196669578552246 202
3 2.5509340000000003 117.7582700252533 82
4 3.720243 3.069139242172241 151
5 6.817359 79.92487502098083 73
6 8.744517 53.67154026031494 26
7 62.444424000000005 13.643858432769775 175
8 80.27600000000001 59.582693338394165 204
9 87.15794100000001 4.579750299453735 182
10 88.895815 0.1039962768554687 9
11 93.130002 245.94434309005737 22
12 94.460391 69.64579319953918 139
13 125.770718 13.8380286693573 8
14 142.033315 0.2684483528137207 199
15 149.52092 0.2201185226440429 8
16 152.73743399999998 47.63173699378967 181
17 156.388151 0.3939771652221679 215
18 158.105023 168.15277552604675 45
19 168.213426 84.61090850830078 5
20 207.050728 10.514808654785156 196
21 220.99726399999997 16.878137350082397 109
22 245.538717 3.734508991241455 174
23 252.624494 46.119447231292725 38
24 252.852355 2.331519842147827 218
25 259.271346 102.5082676410675 22
26 301.10056000000003 113.13597345352171 75
27 33

In [63]:
# Per query_idx: number of executions and mean / min / max / standard deviation of the runtime.
for i, rows in concurrency_df.groupby("query_idx"):
    runtime = rows["runtime"].values
    print(i, len(rows), np.mean(runtime), np.min(runtime), np.max(runtime), np.std(runtime))

0 23 6.9897928134254785 1.7361910343170166 46.89248514175415 9.473017881796181
1 21 0.042800471896216954 0.0306851863861084 0.064239501953125 0.008890692266292256
2 25 77.7485980606079 36.011961460113525 176.65631198883057 32.84361507978263
3 21 723.5043605055128 514.6219673156738 996.0332036018372 99.72439938805589
4 23 183.26921186239824 95.24777936935423 322.1603753566742 57.0662497887613
5 21 72.08878095944722 24.272745609283447 158.4005913734436 29.339563577499423
6 22 103.16899719021536 31.87341856956482 219.7085883617401 39.49215570636949
7 22 0.9977116801521995 0.5253846645355225 3.6383426189422607 0.6118403643045283
8 23 5.961048416469408 0.2201185226440429 27.26981830596924 6.273220019590701
9 22 0.1255583979866721 0.0557055473327636 0.851060152053833 0.1615357087075341
10 21 0.2136202653249105 0.0695333480834961 0.7829084396362305 0.15434167068471696
11 23 16.184140288311504 3.430562973022461 45.53382992744446 11.56589882122229
12 22 97.93932069431652 32.612547636032104 207.

In [64]:
# Preview the first four rows of concurrency_df.
concurrency_df.head(4)

Unnamed: 0,index,query_idx,runtime,start_time,end_time,pre_exec_info,concur_info,num_concurrent_queries,concur_info_train,num_concurrent_queries_train
0,0,24,85.749652,0.0,85.749652,[],"[(204, 0.0, 1.0899443626403809), (202, 1.86680...",8,[],0
1,1,204,1.089944,0.0,1.089944,[],"[(24, 0.0, 85.74965190887451)]",1,"[(24, 0.0, 85.74965190887451)]",1
2,2,202,3.19667,1.866808,5.063478,"[(204, 0.0, 1.0899443626403809)]","[(24, 0.0, 85.74965190887451), (82, 2.55093400...",3,"[(24, 0.0, 85.74965190887451)]",1
3,3,82,117.75827,2.550934,120.309204,"[(204, 0.0, 1.0899443626403809)]","[(24, 0.0, 85.74965190887451), (202, 1.8668079...",11,"[(24, 0.0, 85.74965190887451), (202, 1.8668079...",2
