In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import copy
import sys
import os
sys.path.append("../")
from parser.utils import load_json, dfs_cardinality, estimate_scan_in_mb
from models.feature.single_xgboost_feature import find_top_k_operators, featurize_one_plan, get_top_k_table_by_size
from utils.load_brad_trace import load_trace, create_concurrency_dataset, load_trace_all_version
from models.concurrency.utils import pre_info_train_test_seperation
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from models.single.stage import SingleStage
from models.concurrency.complex_models import ConcurrentRNN
from scheduler.greedy_scheduler import GreedyScheduler
from scheduler.linear_programming_scheduler import LPScheduler
from simulator.simulator import Simulator

np.set_printoptions(precision=4)

In [2]:
parsed_queries_path = "/Users/ziniuw/Desktop/research/Data/AWS_trace/mixed_aurora/aurora_mixed_parsed_queries.json"
#parsed_queries_path = "/Users/ziniuw/Desktop/research/Data/AWS_trace/mixed_postgres/postgres_mixed_parsed_queries.json"
plans = load_json(parsed_queries_path, namespace=False)
folder_name = "mixed_redshift"
directory = f"/Users/ziniuw/Desktop/research/Data/AWS_trace/{folder_name}/"
all_raw_trace, all_trace = load_trace_all_version(directory, 8, concat=True)
all_concurrency_df = []
for trace in all_trace:
    concurrency_df = create_concurrency_dataset(trace, engine=None, pre_exec_interval=200)
    all_concurrency_df.append(concurrency_df)
concurrency_df = pd.concat(all_concurrency_df, ignore_index=True)
train_trace_df_sep, eval_trace_df_sep = pre_info_train_test_seperation(concurrency_df)
print(len(train_trace_df_sep), len(eval_trace_df_sep))
np.random.seed(0)
train_idx = np.random.choice(len(concurrency_df), size=int(0.8 * len(concurrency_df)), replace=False)
test_idx = [i for i in range(len(concurrency_df)) if i not in train_idx]
train_trace_df = copy.deepcopy(concurrency_df.iloc[train_idx])
eval_trace_df = concurrency_df.iloc[test_idx]
eval_trace_df = copy.deepcopy(eval_trace_df[eval_trace_df['num_concurrent_queries'] > 0])
print(len(train_trace_df), len(eval_trace_df))

24263 31762
51063 11219


In [3]:
parsed_queries_path = "/Users/ziniuw/Desktop/research/Data/AWS_trace/long_redshift/mixed_redshift_parsed_queries.json"
#parsed_queries_path = "/Users/ziniuw/Desktop/research/Data/AWS_trace/mixed_postgres/postgres_mixed_parsed_queries.json"
plans = load_json(parsed_queries_path, namespace=False)

directory = "saved_results/redshift/"
all_trace = []
for file in os.listdir(directory):
    if file.endswith(".csv"):
        print(file)
        file = os.path.join(directory, file)
        df = pd.read_csv(file)
        df = df[df['run_time_s'] > 0]
        all_trace.append(df)
all_concurrency_df = []
for trace in all_trace:
    concurrency_df = create_concurrency_dataset(trace, engine=None, pre_exec_interval=200)
    all_concurrency_df.append(concurrency_df)
concurrency_df = pd.concat(all_concurrency_df, ignore_index=True)

timeout_600_num_clients10_baseline.csv
timeout_600_num_clients4_baseline.csv
timeout_600_num_clients8_baseline.csv
timeout_1000_num_clients10_baseline.csv
timeout_1000_num_clients8_baseline.csv
timeout_600_num_clients14_baseline.csv
timeout_1000_num_clients2_baseline.csv
timeout_600_num_clients12_baseline.csv
timeout_600_num_clients6_baseline.csv


In [4]:
concurrency_df = create_concurrency_dataset(all_trace[-1], engine=None, pre_exec_interval=200)
len(concurrency_df)

482

In [5]:
ss = SingleStage(use_table_features=True, true_card=False, use_median=True)
#df = ss.featurize_data(train_trace_df, parsed_queries_path)
df = ss.featurize_data(concurrency_df, parsed_queries_path)
ss.train(df)
rnn = ConcurrentRNN(ss, 
                     "redshift",
                    input_size=len(ss.all_feature[0]) * 2 + 7,
                    embedding_dim=128,
                    hidden_size=256,
                    num_layers=2,
                    loss_function="q_loss",
                    last_output=True,
                    use_separation=False
                   )
rnn.load_model("checkpoints")

Top 20 operators contains 0.9810270680495826 total operators


In [5]:
import pickle as pkl
with open("checkpoints/redshift_stage_model.pkl", "wb") as f:
    pkl.dump(ss, f)

In [41]:
with open("checkpoints/postgres_stage_model.pkl", "rb") as f:
    a = pkl.load(f)

In [27]:
scheduler = GreedyScheduler(ss, rnn, alpha=0.1)
simulator = Simulator(scheduler)
concurrency_df = concurrency_df.sort_values(by=['start_time'], ascending=True)
original_predictions = scheduler.make_original_prediction(concurrency_df)
assert len(concurrency_df) == len(original_predictions)

100%|█████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 19.87it/s]

50% absolute error is 3.471813440322876, q-error is 1.1254734992980957
90% absolute error is 25.56133728027346, q-error is 1.7138597607612611
95% absolute error is 35.291097259521464, q-error is 2.1185851097106934





In [28]:
original_runtime = []
all_start_time = concurrency_df["start_time"].values
all_query_idx = concurrency_df["query_idx"].values
for i in range(len(concurrency_df)):
    original_runtime.append(original_predictions[i])
    # replaying the query one-by-one
    if i < len(concurrency_df) - 1:
        next_query_start_time = all_start_time[i + 1]
    else:
        next_query_start_time = None
    #print("==============================", i, original_predictions[i])
    simulator.replay_one_query(all_start_time[i], next_query_start_time, i, all_query_idx[i])
    #scheduler.print_state()
simulator.finish_all_queries(all_start_time[-1])

In [29]:
len(original_runtime), len(concurrency_df), len(simulator.scheduler.all_query_runtime)

(482, 482, 482)

In [30]:
new_runtime = []
for i in range(len(concurrency_df)):
    new_runtime.append(simulator.scheduler.all_query_runtime[i])
original_runtime = np.asarray(original_runtime)
new_runtime = np.asarray(new_runtime)

In [31]:
rt = concurrency_df['runtime'].values

In [32]:
np.mean(original_runtime), np.percentile(original_runtime, 50), np.percentile(original_runtime, 90), np.percentile(original_runtime, 95), np.percentile(original_runtime, 99)


(89.35122,
 49.377092361450195,
 244.20334777832045,
 371.9452499389648,
 431.8284017944336)

In [33]:
np.mean(new_runtime), np.percentile(new_runtime, 50), np.percentile(new_runtime, 90), np.percentile(new_runtime, 95), np.percentile(new_runtime, 99)


(38.092139970644915,
 15.542807579040527,
 104.50299148559571,
 135.61857773427965,
 207.95546186119094)

In [12]:
np.mean(new_runtime), np.percentile(new_runtime, 50), np.percentile(new_runtime, 90), np.percentile(new_runtime, 95), np.percentile(new_runtime, 99)


(28.082790672207317,
 10.425809383392334,
 82.27797918415067,
 113.082476568222,
 159.82312020009996)

In [13]:
np.mean(rt), np.percentile(rt, 50), np.percentile(rt, 90), np.percentile(rt, 95), np.percentile(rt, 99)

(87.61418041599242,
 51.053452134132385,
 222.21745131015774,
 370.2183699250221,
 424.97391885995864)

In [None]:
simulator.replay_one_query(1.87, 2.76, 143, 143)
scheduler.print_state()

In [17]:
a = torch.zeros((3, 4))

In [18]:
a[0,1] = 1

In [19]:
a

tensor([[0., 1., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [42]:
ss

<models.single.stage.SingleStage at 0x2ea2fdb90>

In [62]:
#rt = np.load("checkpoints/previous_runs/timeout_1000_sys_exec_time_baseline.npy")
#e2e_rt = np.load("checkpoints/previous_runs/timeout_1000_e2e_runtime_baseline.npy")
rt = np.load("checkpoints/timeout_1000_sys_exec_time_baseline.npy")
e2e_rt = np.load("checkpoints/timeout_1000_e2e_runtime_baseline.npy")
#ours_rt = np.load("checkpoints/previous_runs/timeout_1000_sys_exec_time_ours.npy")
#ours_e2e = np.load("checkpoints/previous_runs/timeout_1000_e2e_runtime_ours.npy")
ours_rt = np.load("checkpoints/timeout_200_sys_exec_time_ours.npy")
ours_e2e = np.load("checkpoints/timeout_200_e2e_runtime_ours.npy")
all_raw_trace, all_trace = load_trace(directory, 8, concat=True)
concurrency_df = create_concurrency_dataset(
            all_trace, engine=None, pre_exec_interval=200
)
concurrency_df = concurrency_df.sort_values(by=['start_time'], ascending=True)
concurrency_df_rt = concurrency_df["runtime"].values

In [63]:
len(ours_rt), len(ours_e2e), len(rt)

(900, 900, 700)

In [64]:
def report_performance(rt, idx=None):
    if idx is None:
        runtime = rt
    else:
        runtime = rt[idx]
    print(np.mean(runtime), np.percentile(runtime, 50), np.percentile(runtime, 90), np.percentile(runtime, 95), np.percentile(runtime, 99))

In [65]:
report_performance(rt, None)

100.42410005875996 25.405025482177734 308.23970189094547 424.39539455175395 799.4055482363696


In [67]:
np.sum(e2e_rt > 200), np.sum(ours_rt > 190)

(133, 42)

In [68]:
idx = np.where(ours_e2e[:700] > 0)[0]
report_performance(rt, idx)
report_performance(e2e_rt, idx)
report_performance(ours_rt, idx)
report_performance(ours_e2e, idx)
report_performance(concurrency_df_rt, idx)

100.42410005875996 25.405025482177734 308.23970189094547 424.39539455175395 799.4055482363696
103.85096815313612 28.825364112854004 311.58910791873933 428.0414997935294 802.2661592268938
44.59993027687073 15.497947931289673 143.55612349510193 191.2482687950134 201.40416612148286
86.18284460272108 41.43850064277649 238.70695281028748 307.07629054784775 438.24097260713575
3.7934531685284205 0.723724365234375 5.894774651527405 14.9151360273361 60.87397601127617


In [39]:
idx = np.where(ours_e2e[:400] > 0)[0]
report_performance(rt, idx)
report_performance(e2e_rt, idx)
report_performance(ours_rt, idx)
report_performance(ours_e2e, idx)
report_performance(concurrency_df_rt, idx)

109.33521686077118 29.615013480186462 317.6123276472093 468.07558170556985 972.4766488862037
109.788342461586 30.061041951179504 318.1608426094057 468.7256953954693 972.7789209103584
23.715621346235274 6.375097990036011 62.67300522327424 87.36640502214429 200.70521162986756
37.25047614634037 9.311975121498108 101.31462357044221 147.93589364290148 244.3785142660141
3.477743474841118 0.6712734699249268 5.001080036163333 10.581547439098308 60.15818839788426


In [95]:
start = 0
end = 100
np.stack((np.arange(end-start) + start, rt[start:end], e2e_rt[start:end], ours_rt[start:end], ours_e2e[start:end], concurrency_df_rt[start:end]), axis=1)

array([[  0.    ,  73.771 ,  77.1567,  36.3439,  36.6396,  85.7497],
       [  1.    ,  41.6181,  44.5863,   9.7964,  10.0411,   1.0899],
       [  2.    ,  14.4929,  17.5326,   9.2689,  19.5902,   3.1967],
       [  3.    , 154.0823, 157.3001,  55.8749,  66.2013, 117.7583],
       [  4.    ,  14.3063,  17.0312,  11.2488,  21.5957,   3.0691],
       [  5.    ,  82.2752,  85.1709,  33.3002,  68.7077,  79.9249],
       [  6.    ,   8.9198,  12.0213,   5.6296,  12.554 ,  53.6715],
       [  7.    ,   8.423 ,  12.0241,   1.194 ,  14.5462,  13.6439],
       [  8.    ,  41.0496,  44.5766,   3.6955,   4.0118,  59.5827],
       [  9.    ,   3.8929,   7.0135,   2.5508,   3.015 ,   4.5798],
       [ 10.    ,   0.704 ,   4.0074,   0.126 ,   0.5054,   0.104 ],
       [ 11.    , 196.173 , 199.3898,  71.7047,  72.1904, 245.9443],
       [ 12.    ,  62.0848,  65.6142,  31.5491,  32.0773,  69.6458],
       [ 13.    ,  20.7744,  24.5431,   3.5321,  43.6573,  13.838 ],
       [ 14.    ,   1.2289,   4.50

In [66]:
for i in range(100):
    print(i, concurrency_df["start_time"].iloc[i], concurrency_df["runtime"].iloc[i], concurrency_df["query_idx"].iloc[i])

0 0.0 85.74965190887451 24
1 0.0 1.0899443626403809 204
2 1.8668079999999998 3.196669578552246 202
3 2.5509340000000003 117.7582700252533 82
4 3.720243 3.069139242172241 151
5 6.817359 79.92487502098083 73
6 8.744517 53.67154026031494 26
7 62.444424000000005 13.643858432769775 175
8 80.27600000000001 59.582693338394165 204
9 87.15794100000001 4.579750299453735 182
10 88.895815 0.1039962768554687 9
11 93.130002 245.94434309005737 22
12 94.460391 69.64579319953918 139
13 125.770718 13.8380286693573 8
14 142.033315 0.2684483528137207 199
15 149.52092 0.2201185226440429 8
16 152.73743399999998 47.63173699378967 181
17 156.388151 0.3939771652221679 215
18 158.105023 168.15277552604675 45
19 168.213426 84.61090850830078 5
20 207.050728 10.514808654785156 196
21 220.99726399999997 16.878137350082397 109
22 245.538717 3.734508991241455 174
23 252.624494 46.119447231292725 38
24 252.852355 2.331519842147827 218
25 259.271346 102.5082676410675 22
26 301.10056000000003 113.13597345352171 75
27 33

In [63]:
for i, rows in concurrency_df.groupby("query_idx"):
    runtime = rows["runtime"].values
    print(i, len(rows), np.mean(runtime), np.min(runtime), np.max(runtime), np.std(runtime))

0 23 6.9897928134254785 1.7361910343170166 46.89248514175415 9.473017881796181
1 21 0.042800471896216954 0.0306851863861084 0.064239501953125 0.008890692266292256
2 25 77.7485980606079 36.011961460113525 176.65631198883057 32.84361507978263
3 21 723.5043605055128 514.6219673156738 996.0332036018372 99.72439938805589
4 23 183.26921186239824 95.24777936935423 322.1603753566742 57.0662497887613
5 21 72.08878095944722 24.272745609283447 158.4005913734436 29.339563577499423
6 22 103.16899719021536 31.87341856956482 219.7085883617401 39.49215570636949
7 22 0.9977116801521995 0.5253846645355225 3.6383426189422607 0.6118403643045283
8 23 5.961048416469408 0.2201185226440429 27.26981830596924 6.273220019590701
9 22 0.1255583979866721 0.0557055473327636 0.851060152053833 0.1615357087075341
10 21 0.2136202653249105 0.0695333480834961 0.7829084396362305 0.15434167068471696
11 23 16.184140288311504 3.430562973022461 45.53382992744446 11.56589882122229
12 22 97.93932069431652 32.612547636032104 207.

In [64]:
concurrency_df.head(4)

Unnamed: 0,index,query_idx,runtime,start_time,end_time,pre_exec_info,concur_info,num_concurrent_queries,concur_info_train,num_concurrent_queries_train
0,0,24,85.749652,0.0,85.749652,[],"[(204, 0.0, 1.0899443626403809), (202, 1.86680...",8,[],0
1,1,204,1.089944,0.0,1.089944,[],"[(24, 0.0, 85.74965190887451)]",1,"[(24, 0.0, 85.74965190887451)]",1
2,2,202,3.19667,1.866808,5.063478,"[(204, 0.0, 1.0899443626403809)]","[(24, 0.0, 85.74965190887451), (82, 2.55093400...",3,"[(24, 0.0, 85.74965190887451)]",1
3,3,82,117.75827,2.550934,120.309204,"[(204, 0.0, 1.0899443626403809)]","[(24, 0.0, 85.74965190887451), (202, 1.8668079...",11,"[(24, 0.0, 85.74965190887451), (202, 1.8668079...",2
