In [1]:
import os
import sys

# The repository root sits two directory levels above the notebook's working directory.
ROOT_PATH = os.path.split(os.path.split(os.getcwd())[0])[0]
EXPERIMENT_PATH = f"{ROOT_PATH}/experiments/prediction-modes"
# Put the root at the front of the import path so the `src.*` imports below resolve.
sys.path[:0] = [ROOT_PATH]

In [2]:
from json import load, dumps, dump
from collections import defaultdict
from src.datasets.data_config import DOPS, DEFAULT_HINTSET, HINTSETS
from src.datasets.oracle import Oracle, OracleRequest, TIMEOUT

In [3]:
# Only the JOB benchmark is loaded: per the original note, other benchmarks
# don't have more than 1 query inside one template.
oracle = Oracle(f"{ROOT_PATH}/data/processed/JOB")

In [4]:
# Value passed as `dop` in every oracle request made by this notebook
# (DOP presumably stands for degree of parallelism — confirm against `src.datasets`).
DEFAULT_DOP = 64
# Magnitude of the sentinel boost (-TIMEOUT_REL) that `make_group_summary` records
# for a query whose execution time under a hintset reaches TIMEOUT.
TIMEOUT_REL = 2

# Is the template or plan enough to safely estimate the usefulness of a hintset?

Here we check how much the execution time varies within a group of queries under different hintsets. \
The grouping functions considered are:
- query $\rightarrow$ query template, 
- query $\rightarrow$ logical plan, and
- query $\rightarrow$ full plan (i.e. logical plan with estimated cardinalities)

For simplicity, we restrict ourselves to groups in which a) some hintset speeds up some queries while making others time out (`T/O`), and b) at least one query is sufficiently long (default execution time of at least 1 sec).

In [5]:
def make_group_summary(query_groups, only_interesting=True):
    """
    Find, for every query group, the hintset on which the group's queries disagree the most.

    For each group with more than one query, every hintset from `HINTSETS` is ranked by
    the number of queries it drives into a timeout (more timeouts win) and, on ties, by
    the best relative boost it gives to any query of the group (a later hintset wins an
    exact tie). A hintset is eligible only if it speeds up at least one query by more
    than 1%.

    Args:
        query_groups: iterable of sized collections of query names known to `oracle`.
        only_interesting: if True, skip groups in which even the slowest default
            execution time is below 1000 (the notebook prints times divided by 1000
            as seconds, so this means "under one second").

    Returns:
        A list of tuples `(query_group, hintset, speedups, def_times, cust_times)`, one
        per group for which an eligible hintset was found. The three lists are aligned
        with `query_group`. `speedups` holds `def_time - cust_time`, so for a timed-out
        run the value is dominated by the raw `TIMEOUT` constant.
    """
    group_summaries = []
    for query_group in query_groups:
        # A lone query cannot disagree with anything, so such groups never yield a summary.
        if len(query_group) <= 1:
            continue

        # Default-hintset timings do not depend on the candidate hintset: fetch them once.
        def_times = [
            oracle.get_execution_time(
                OracleRequest(query_name=q_n, hintset=DEFAULT_HINTSET, dop=DEFAULT_DOP)
            )
            for q_n in query_group
        ]
        if only_interesting and max(def_times) < 1000:
            continue

        max_timeouts = 0
        best_case = None  # (hintset, speedups, cust_times, boosts) of the current winner
        for hs in HINTSETS:
            cust_times = [
                oracle.get_execution_time(
                    OracleRequest(query_name=q_n, hintset=hs, dop=DEFAULT_DOP)
                )
                for q_n in query_group
            ]
            speedups = [
                def_time - cust_time for def_time, cust_time in zip(def_times, cust_times)
            ]

            # Signed relative boost: positive = speed-up factor, negative = slow-down
            # factor, -TIMEOUT_REL = the run hit the timeout.
            boosts = []
            for def_time, cust_time in zip(def_times, cust_times):
                if cust_time >= TIMEOUT:
                    boosts.append(-TIMEOUT_REL)
                elif cust_time > def_time:
                    boosts.append(-cust_time / def_time)
                else:
                    boosts.append(def_time / cust_time)

            top_boost = max(boosts)
            if top_boost <= 1.01:
                # Nobody in the group benefits noticeably from this hintset.
                continue

            # Count timeouts from the raw times: comparing boosts with -TIMEOUT_REL would
            # also match a query that was merely slowed down by exactly TIMEOUT_REL times.
            n_timeouts = sum(cust_time >= TIMEOUT for cust_time in cust_times)

            new_n_timeouts_record = n_timeouts > max_timeouts
            ties_n_timeouts_record = n_timeouts == max_timeouts
            new_boost_record = best_case is None or max(best_case[3]) <= top_boost
            if new_n_timeouts_record or (ties_n_timeouts_record and new_boost_record):
                max_timeouts = n_timeouts
                best_case = (hs, speedups, cust_times, boosts)

        if best_case is not None:
            hs_to_show, speedups_to_show, cust_times_to_show, _ = best_case
            group_summaries.append(
                (query_group, hs_to_show, speedups_to_show, def_times, cust_times_to_show)
            )

    return group_summaries

In [6]:
def pretty_print_summary(query_group, hs, def_times, cust_times):
    """
    Print one markdown table row describing how hintset `hs` affects a query group.

    Times belonging to the same query name are averaged. The row lists, in sorted
    query-name order: the query names, the hintset, the default times and the custom
    times (both divided by 1000 and suffixed with "s"), and the signed relative boost.
    A custom time at or above `TIMEOUT` is rendered as `T\\O` with boost `NaN`.

    The boost follows the same sign convention as `make_group_summary`: it is negative
    only when the custom time is strictly larger than the default one, so equal times
    give `1.00` rather than `-1.00`.

    Args:
        query_group: query names, aligned with `def_times` and `cust_times`.
        hs: hintset identifier, printed as is.
        def_times: execution times under the default hintset.
        cust_times: execution times under hintset `hs`.
    """
    import numpy as np

    q_to_def_times, q_to_cust_times = defaultdict(list), defaultdict(list)
    # Sorting is kept so that repeated query names are averaged in a deterministic order.
    for q_n, def_time, cust_time in sorted(zip(query_group, def_times, cust_times)):
        q_to_def_times[q_n].append(def_time)
        q_to_cust_times[q_n].append(cust_time)

    query_col, def_time_col, cust_time_col, boost_col = [], [], [], []
    for q_n in sorted(set(query_group)):
        def_time = np.mean(q_to_def_times[q_n])
        cust_time = np.mean(q_to_cust_times[q_n])

        query_col.append(f"{q_n}")
        def_time_col.append(f"{def_time / 1000:0.2f}s")

        if cust_time >= TIMEOUT:
            cust_time_col.append("`T\\O`")
            boost_col.append("`NaN`")
            continue

        cust_time_col.append(f"{cust_time / 1000:0.2f}s")
        if cust_time > def_time:
            boost_col.append(f"{-cust_time / def_time:0.2f}")
        else:
            boost_col.append(f"{def_time / cust_time:0.2f}")

    queries = "{" + ", ".join(query_col) + "}"
    defaults = ", ".join(def_time_col)
    customs = ", ".join(cust_time_col)
    boosts = ", ".join(boost_col)
    print(f"| {queries} | # {hs} |{defaults} | {customs}  | {boosts} |")

In [7]:
from src.utils import get_template_id, get_full_plan, get_logical_plan

In [8]:
# Bucket every query known to the oracle under three keys computed for the default
# hintset and DOP: its template id, its logical plan and its full plan.
template_to_queries, logical_plan_to_queries, full_plan_to_queries = (
    defaultdict(list),
    defaultdict(list),
    defaultdict(list),
)

for query_name in oracle.get_query_names():
    kwargs = dict(
        query_name=query_name,
        oracle=oracle,
        hintset=DEFAULT_HINTSET,
        dop=DEFAULT_DOP,
    )
    template_id, logical_plan, full_plan = (
        get_template_id(**kwargs),
        get_logical_plan(**kwargs),
        get_full_plan(**kwargs),
    )

    template_to_queries[template_id].append(query_name)
    logical_plan_to_queries[logical_plan].append(query_name)
    full_plan_to_queries[full_plan].append(query_name)

### query $\rightarrow$ query template

In [9]:
# Number of template groups that contain at least two queries.
n_big_groups = sum(1 for queries in template_to_queries.values() if len(queries) > 1)
print(n_big_groups)

11


Although `JOB` queries with the same number look extremely similar, they differ in how their predicates are written. For example, queries `6a` and `6b` differ in that one has a condition of the form `col in ['x', 'y']`, while the other has the single-value condition `col in ['x']`, which is written as `col = 'x'`. As a result, they get different template numbers. In fact, the results of template-based prediction (taking such effects into account) will **be even less stable** than those of logical-plan-based prediction, so they can be omitted.

In [10]:
# Kept for reference only: the template-based grouping is not analysed in this notebook.
# template_summary = make_group_summary(query_groups=template_to_queries.values())
# template_summary = sorted(template_summary, key=lambda el: max(el[2]) - min(el[2]), reverse=True)
# for (query_group, hs, speedups, def_times, cust_times) in template_summary:
#     print(f"GROUP {query_group} WITH HINSET #{hs} AND SPEEDUP VARIATION {(max(speedups) - min(speedups))/1000:0.1f}s")
#     pretty_print_summary(query_group, hs, def_times, cust_times)
#     print("-" * 60)

### query $\rightarrow$ logical plan

In [11]:
# Rank logical-plan groups by the spread of their per-query speedups (widest first)
# and report the five widest ones.
logical_plan_summary = sorted(
    make_group_summary(query_groups=logical_plan_to_queries.values()),
    key=lambda summary: max(summary[2]) - min(summary[2]),
    reverse=True,
)

for query_group, hs, speedups, def_times, cust_times in logical_plan_summary[:5]:
    print(f"GROUP {query_group} WITH HINSET #{hs} AND SPEEDUP VARIATION {(max(speedups) - min(speedups))/1000:0.1f}s")
    pretty_print_summary(query_group, hs, def_times, cust_times)
    print("-" * 60)

GROUP ['16d', '16c', '16b'] WITH HINSET #110 AND SPEEDUP VARIATION 4398046547.2s
| {16b, 16c, 16d} | # 110 |69.60s, 5.38s, 4.17s | 29.38s, `T\O`, `T\O`  | 2.37, `NaN`, `NaN` |
------------------------------------------------------------
GROUP ['30c', '30a'] WITH HINSET #47 AND SPEEDUP VARIATION 4398046517.6s
| {30a, 30c} | # 47 |7.58s, 14.03s | `T\O`, 0.00s  | `NaN`, 24968.09 |
------------------------------------------------------------
GROUP ['6a', '6b', '6c', '6d', '6e'] WITH HINSET #126 AND SPEEDUP VARIATION 4398046516.6s
| {6a, 6b, 6c, 6d, 6e} | # 126 |0.05s, 14.53s, 0.05s, 14.36s, 0.05s | `T\O`, 8.95s, `T\O`, `T\O`, `T\O`  | `NaN`, 1.62, `NaN`, `NaN`, `NaN` |
------------------------------------------------------------
GROUP ['5a', '5b'] WITH HINSET #95 AND SPEEDUP VARIATION 4398046510.4s
| {5a, 5b} | # 95 |1.04s, 0.97s | `T\O`, 0.66s  | `NaN`, 1.46 |
------------------------------------------------------------
GROUP ['17c', '17b', '17d', '17f'] WITH HINSET #81 AND SPEEDUP VARIAT

| Query Group| Hintset \# | Default Time| Custom Time       | Relative Boost|
|------------|-------------|-----------------|--------------------------------|--------------------------------|
| {16b, 16c, 16d} | # 110 |69.60s, 5.38s, 4.17s | 29.38s, `T\O`, `T\O`  | 2.37, `NaN`, `NaN` |
| {30a, 30c} | # 47 |7.58s, 14.03s | `T\O`, 0.00s  | `NaN`, 24968.09 |
| {6a, 6b, 6c, 6d, 6e} | # 126 |0.05s, 14.53s, 0.05s, 14.36s, 0.05s | `T\O`, 8.95s, `T\O`, `T\O`, `T\O`  | `NaN`, 1.62, `NaN`, `NaN`, `NaN` |
| {5a, 5b} | # 95 |1.04s, 0.97s | `T\O`, 0.66s  | `NaN`, 1.46 |
| {17b, 17c, 17d, 17f} | # 81 |10.47s, 10.08s, 9.98s, 15.92s | `T\O`, `T\O`, `T\O`, 15.35s  | `NaN`, `NaN`, `NaN`, 1.04 |


I.e., predictions based on the logical plan are also **unreliable**.

*P.S. one more interesting case:*

```
GROUP ['6a', '6b', '6c', '6d', '6e'] | HINSET #106
Boosts: [-2.0, 3.073356346209928, -2.0, 2.861518916316778, -2.0]
Default Times [53.358, 14532.848, 52.611, 14361.208, 52.286]
```

That is, even though some hintsets cause degradation, there is still a hintset (`#106`) that speeds up the two largest queries (`6b`, `6d`) in the group at the same time, which is probably better than the default behavior on average (since the other queries are too small, and slowing them down may be justified).

### query $\rightarrow$ full plan

In [12]:
# Rank full-plan groups by the spread of their per-query speedups (widest first);
# short-running groups are included as well, and every group found is reported.
full_plan_summary = sorted(
    make_group_summary(query_groups=full_plan_to_queries.values(), only_interesting=False),
    key=lambda summary: max(summary[2]) - min(summary[2]),
    reverse=True,
)

for query_group, hs, speedups, def_times, cust_times in full_plan_summary:
    print(f"GROUP {query_group} WITH HINSET #{hs} AND SPEEDUP VARIATION {(max(speedups) - min(speedups))/1000:0.1f}s")
    pretty_print_summary(query_group, hs, def_times, cust_times)
    print("-" * 60)

GROUP ['6b', '6d'] WITH HINSET #126 AND SPEEDUP VARIATION 4398046502.3s
| {6b, 6d} | # 126 |14.53s, 14.36s | 8.95s, `T\O`  | 1.62, `NaN` |
------------------------------------------------------------
GROUP ['8d', '8c'] WITH HINSET #110 AND SPEEDUP VARIATION 0.6s
| {8c, 8d} | # 110 |10.83s, 6.46s | 10.71s, 6.91s  | 1.01, -1.07 |
------------------------------------------------------------
GROUP ['17c', '17d'] WITH HINSET #120 AND SPEEDUP VARIATION 0.5s
| {17c, 17d} | # 120 |10.08s, 9.98s | 9.58s, 9.95s  | 1.05, 1.00 |
------------------------------------------------------------
GROUP ['21a', '21b'] WITH HINSET #86 AND SPEEDUP VARIATION 0.0s
| {21a, 21b} | # 86 |0.13s, 0.10s | 0.11s, 0.12s  | 1.15, -1.22 |
------------------------------------------------------------
GROUP ['6a', '6c', '6e'] WITH HINSET #86 AND SPEEDUP VARIATION 0.0s
| {6a, 6c, 6e} | # 86 |0.05s, 0.05s, 0.05s | 0.06s, 0.04s, 0.05s  | -1.06, 1.26, -1.03 |
------------------------------------------------------------


| Query Group| Hintset \# | Default Time| Custom Time       | Relative Boost|
|------------|----------|--------------------|--------------------------------|--------------------------------|
| {6b, 6d} | # 126 |14.53s, 14.36s | 8.95s, `T\O`  | 1.62, `NaN` |

We see that **even the full plan isn't enough for reliable prediction**. \
*P.S. The situation is the same for other `DOP` values.*

This is the reason we started to address the problem of possible degradations. Since the estimates of plan execution time under a hintset look extremely unreliable, we propose to switch to **transitions**
$$\tau: Plan_{default} \rightarrow Plan_{custom}$$ 
By measuring the distance from the custom plan actually obtained to the plans we have already seen during training, we can get rid of most of the regressions. For more details on this technique, see the notebook `transition.ipynb`.