#### Online Index Selection Via Combinatorial Contextual Multi Armed Bandits

In [1]:
import logging
import datetime
import os
import subprocess
import uuid

import numpy as np
import pyodbc
import sys
import random
import pandas as pd
import time
import os
from tqdm import tqdm
import logging
import re
import json
import itertools
import math
from collections import defaultdict
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

import IPython
notebook_path = IPython.get_ipython().starting_dir
target_subdirectory_path = os.path.abspath(os.path.join(os.path.dirname(notebook_path), 'database'))
sys.path.append(target_subdirectory_path)
from utils import *

from mab import *
from dta import *

In [2]:
# read workload queries from JSON file
def read_workload(workload_filepath):
    """Load a workload stored as JSON Lines (one JSON document per line).

    Args:
        workload_filepath: Path of the workload file to read.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    workload = []
    with open(workload_filepath) as f:
        # read the queries from each line
        for line in f:
            # A blank line (e.g. a trailing empty line or a separator) is not a
            # JSON document; skip it instead of letting json.loads raise.
            if not line.strip():
                continue
            workload.append(json.loads(line))

    return workload

# Path (relative to the notebook's working directory) of the JSON workload file
# holding the generated queries, one JSON document per line
workload_filepath = '../datagen/TPCH_workloads/TPCH_static_100_workload.json'

# Read the workload queries from file
workload = read_workload(workload_filepath)
# Report how many queries were loaded
print(len(workload))

2100


#### MAB index selection algorithm. On each round, do the following:

1) Generate candidate arms/indices using mini-workload from previous round
2) Generate context vector for each candidate arm
3) Select the best super-arm, i.e. configuration/subset of candidate indices
4) Materialize the super-arm configuration, then execute new mini-workload for current round


We will implement these 4 steps separately in the given order.

#### 1. Generation of Candidate indices

Build a mini-workload from the first 21 queries of the workload, then test candidate index generation on a single query taken from it.

In [3]:
# Create the MAB agent. Per the recorded output below, construction also lists and
# removes any existing non-clustered indexes, so this cell has database side effects.
mab = MAB()

All non-clustered indexes --> []
All nonclustered indexes removed.


In [3]:
# Build the mini-workload: wrap the first 21 workload entries in Query objects.
connection = start_connection()
try:
    miniworkload = []
    for query in workload[0:21]:
        # convert to Query object
        miniworkload.append(Query(connection, query['template_id'], query['query_string'], query['payload'], query['predicates'], query['order_bys'], query['group_bys']))
finally:
    # Always release the connection, even if constructing a Query raises;
    # otherwise a failed cell leaves the database connection open.
    close_connection(connection)

In [14]:
connection = start_connection()
try:
    # generate candidate indices for a single query of the mini-workload
    # (the slice i:i+1 selects only query i)
    i = 0
    index_arms = mab.generate_candidate_indices(connection, miniworkload[i:i+1], verbose=False)
    print(index_arms.keys())
finally:
    # Always release the connection, even if candidate generation raises
    close_connection(connection)

Generating candidate indices for 1 queries...


Processing queries: 100%|██████████| 1/1 [00:00<00:00, 540.64it/s]

Generated 371 candidate indices
dict_keys(['IX_lineitem_l_shipdate', 'IXN_lineitem_l_shipdate_l_di', 'IXN_lineitem_l_shipdate_l_li', 'IXN_lineitem_l_shipdate_l_qu', 'IXN_lineitem_l_shipdate_l_re', 'IXN_lineitem_l_shipdate_l_ta', 'IXN_lineitem_l_shipdate_l_ex', 'IXN_lineitem_l_shipdate_l_di_l_li', 'IXN_lineitem_l_shipdate_l_di_l_qu', 'IXN_lineitem_l_shipdate_l_di_l_re', 'IXN_lineitem_l_shipdate_l_di_l_ta', 'IXN_lineitem_l_shipdate_l_di_l_ex', 'IXN_lineitem_l_shipdate_l_li_l_qu', 'IXN_lineitem_l_shipdate_l_li_l_re', 'IXN_lineitem_l_shipdate_l_li_l_ta', 'IXN_lineitem_l_shipdate_l_li_l_ex', 'IXN_lineitem_l_shipdate_l_qu_l_re', 'IXN_lineitem_l_shipdate_l_qu_l_ta', 'IXN_lineitem_l_shipdate_l_qu_l_ex', 'IXN_lineitem_l_shipdate_l_re_l_ta', 'IXN_lineitem_l_shipdate_l_re_l_ex', 'IXN_lineitem_l_shipdate_l_ta_l_ex', 'IXN_lineitem_l_shipdate_l_di_l_li_l_qu', 'IXN_lineitem_l_shipdate_l_di_l_li_l_re', 'IXN_lineitem_l_shipdate_l_di_l_li_l_ta', 'IXN_lineitem_l_shipdate_l_di_l_li_l_ex', 'IXN_lineitem_l_




#### 2. Generation of Context Vectors for Each Arm/Index

The context vector of each index can be defined as a concatenation of two pieces:

* Columns Piece:  a vector with length equal to the total number of columns in the database. Each entry in this vector corresponds to one of the columns and contains the value $10^{-j}$ where $j$ is the position of that column in the index, provided that column is in the index, otherwise the value is zero. 

* Derived Context Piece: a vector of length 2. The first component contains the timestamp of the last round in which the index was used, and the second component is the size of the index relative to the size of the entire database.

In [15]:
connection = start_connection()
try:
    # test context vector generation: one context row per candidate index arm
    # (the recorded shape below has as many rows as generated candidates)
    context_vectors = mab.generate_contexts(connection, index_arms)
    print(context_vectors.shape)
finally:
    # Always release the connection, even if context generation raises
    close_connection(connection)

(371, 124)


#### 3. Generation of Super-Arm / Best Configuration

To generate the best configurations:

* compute the estimated upper bound on expected reward from each index
* then solve the 0-1 knapsack problem to find the subset of indices which maximizes total expected reward upper bound while satisfying memory constraint

In [17]:
# test super arm selection
# config_memory_budget_MB=5*1024 gives the selected configuration a 5 GiB (5120 MB) memory budget
selected_indices = mab.select_best_configuration(context_vectors, index_arms, config_memory_budget_MB=5*1024, verbose=True)


Filtered upper bounds: {'IX_lineitem_l_shipdate': 0.0010893481810414496, 'IXN_lineitem_l_shipdate_l_li': 0.5857764017553204, 'IXN_lineitem_l_shipdate_l_re': 0.5857764017553204, 'IXN_lineitem_l_shipdate_l_li_l_re': 1.034876564504173, 'IX_lineitem_l_returnflag': 0.0, 'IXN_lineitem_l_returnflag_l_li': 0.5849596287698582, 'IX_lineitem_l_linestatus': 0.0, 'IXN_lineitem_l_linestatus_l_re': 0.5849596287698582, 'IX_lineitem_l_shipdate_l_returnflag': 0.008706588662745762, 'IXN_lineitem_l_shipdate_l_returnflag_l_li': 0.5912011628802578, 'IX_lineitem_l_shipdate_l_linestatus': 0.008706588662745762, 'IXN_lineitem_l_shipdate_l_linestatus_l_re': 0.5912011628802578, 'IX_lineitem_l_returnflag_l_shipdate': 0.008706588662745762, 'IXN_lineitem_l_returnflag_l_shipdate_l_li': 0.5912011628802578, 'IX_lineitem_l_returnflag_l_linestatus': 0.00755929702917113, 'IX_lineitem_l_linestatus_l_shipdate': 0.008706588662745762, 'IXN_lineitem_l_linestatus_l_shipdate_l_re': 0.5912011628802578, 'IX_lineitem_l_linestatus_l

#### 4. Materialize super-arm configuration, execute mini-workload, and observe rewards

In [43]:
connection = start_connection()
try:
    # Materialize the selected indices, run the full mini-workload against them and
    # collect the execution cost, index creation cost and per-index rewards.
    total_execution_cost, creation_cost, index_rewards = mab.materialize_execute(connection, selected_indices, miniworkload, index_arms, verbose=False)

    print(index_rewards)
finally:
    # Always release the connection, even if index creation or query execution raises
    close_connection(connection)


Total execution cost: 28.466
{'IX_supplier_s_suppkey_s_nationkey': [0.019999999999999997, -0.01], 'IX_customer_c_mktsegment': [0.032, -0.239], 'IX_supplier_s_nationkey': [0.0035, -0.013], 'IX_customer_c_nationkey': [0.031, -0.127], 'IX_customer_c_custkey_c_nationkey': [0.0155, -0.113], 'IX_supplier_s_nationkey_s_suppkey': [0.0035, -0.013], 'IX_customer_c_custkey': [-0.011000000000000003, -0.183], 'IX_supplier_s_comment': [0.002999999999999999, -0.042], 'IX_part_p_container': [0.006666666666666668, -0.172], 'IX_part_p_brand_p_partkey': [0.001666666666666668, -0.431], 'IX_part_p_size_p_partkey': [0.016, -0.126], 'IX_customer_c_phone': [-0.10350000000000001, -0.156], 'IX_customer_c_acctbal_c_custkey': [-0.01750000000000001, -0.105], 'IX_part_p_brand_p_size_p_partkey': [0, -0.382], 'IX_customer_c_acctbal': [0, -0.157], 'IX_customer_c_nationkey_c_custkey': [0, -0.255], 'IX_part_p_partkey_p_brand': [0, -0.144], 'IX_customer_c_phone_c_custkey': [0, -0.214], 'IX_part_p_partkey_p_brand_p_size':

In [44]:
# Feed the rewards observed for the selected indices back to the agent.
# NOTE(review): presumably this updates the bandit's parameter vector (the "Theta"
# printed in the full-round output further down) — confirm in mab.py.
mab.update_parameters(selected_indices, index_rewards, index_arms)


#### Full MAB rounds

In [6]:
# Start from a fresh agent for the full multi-round run. This rebinds `mab`, so the
# agent used in the step-by-step cells above is discarded; per the recorded output,
# construction also removes the non-clustered indexes left over from those cells.
# NOTE(review): alpha_decay_rate is presumably the per-round decay of the
# exploration parameter alpha — confirm in mab.py.
mab = MAB(alpha_decay_rate=0.95)

All non-clustered indexes --> [('dbo', 'lineitem', 'IXN_lineitem_l_returnflag_l_shipdate_l_linestatus_l_ta_l_ex_l_di_l_qu')]
All nonclustered indexes removed.


In [7]:
# Run 20 MAB rounds (i = 1..20), replaying the same 21-query mini-workload
# (built from workload[0:21]) each round with a 5 GiB (5*1024 MB) memory budget
for i in range(1, 21):
    mab.step_round(miniworkload, i, config_memory_budget_MB=5*1024, verbose=False)

Running MAB agent for round 1...
Identifying new query templates and updating statistics...
Number of new query templates added: 21
Number of queries of interest: 0
Materializing configuration and executing queries...
Num indexes added:0, Num indexes removed: 0 Index creation costs: {}
Observed Rewards for indexes: {}

Total execution cost: 173.028, Total index creation cost: 0

Index selection counts: {}

Round completed. Time taken for recommendation: 0:03:27.164178 seconds.

Running MAB agent for round 2...
Identifying new query templates and updating statistics...
Number of new query templates added: 0
Number of queries of interest: 21
Generating candidate indices...
Generating candidate indices for 21 queries...


Processing queries: 100%|██████████| 21/21 [00:00<00:00, 131.70it/s]


Generated 24142 candidate indices
Generating context vectors...
Selecting best configuration...
Theta - parameter vector: 
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
Table filtered indexes: defaultdict(<class 'list'>, {'lineitem': ['IXN_lineitem_l_returnflag_l_shipdate_l_linestatus_l_ta_l_ex_l_di_l_qu', 'IXN_lineitem_l_returnflag_l_linestatus_l_shipdate_l_ta_l_ex_l_di_l_qu', 'IXN_lineitem_l_shipdate_l_returnflag_l_linestatus_l_ta_l_ex_l_di_l_qu'], 'part': ['IXN_part_p_type_p_size_p_mf_p_pa', 'IXN_part_p_size_p_type_p_mf_p_pa', 'IXN_part_p_type_p_mf_p_pa'], 'partsupp': ['IXN_partsupp_ps_suppkey_ps_p_ps_s_ps_a', 'IXN_partsupp_ps_suppkey_ps_s_ps_a', 'IXN_part

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 204.03it/s]

Generated 24142 candidate indices
Generating context vectors...





Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -7.05719904e+00 -8.16186779e-02  0.00000000e+00
  0.00000000e+00  9.13777437e-02  0.00000000e+00  0.00000000e+00
 -4.25637766e-02  0.00000000e+00  0.00000000e+00  0.00000000e+00
  3.97578942e+00  0.00000000e+00 -6.31352659e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  3.91367612e-02  1.88904666e-02
 -2.89242718e-03  0.00000000e+00  0.00000000e+00  1.33184992e-01
 -8.36181654e-02  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  4.27555556e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+0

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 125.74it/s]


Generated 24142 candidate indices
Generating context vectors...
Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -6.63193537e+01 -9.96139492e-02  9.14550124e-02
  0.00000000e+00  7.69758920e-02  0.00000000e+00 -9.74498758e-03
 -4.37219638e-02 -9.74498758e-03  0.00000000e+00  0.00000000e+00
  2.66065430e+00  2.46535403e-01 -6.90395913e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -3.02363636e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  5.69191382e-03  0.00000000e+00  5.55517126e-02  4.31849015e-02
 -2.15971115e-03  0.00000000e+00  0.00000000e+00  1.33053554e-01
 -8.39371964e-02  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00 -2.12409633e-02

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 211.87it/s]

Generated 24142 candidate indices
Generating context vectors...





Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -9.51148047e+01 -1.03922950e-01  8.39889912e-02
  0.00000000e+00  7.81740556e-02  0.00000000e+00 -1.72110088e-02
  3.28205579e-02 -2.00078529e-02  0.00000000e+00  0.00000000e+00
  1.78152647e+00  3.41234153e-01 -7.29864531e+00  0.00000000e+00
  0.00000000e+00 -3.29463676e-02  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -3.02363636e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.15212944e-01
  0.00000000e+00  1.89816170e-01  0.00000000e+00  0.00000000e+00
  8.38495759e-03  0.00000000e+00  7.19189485e-02  5.01659827e-02
  3.70611187e-02  0.00000000e+00  0.00000000e+00  2.01040733e-01
  9.78916200e-03  0.00000000e+00  0.00000000e+00 -8.41358013e-03
  0.00000000e+00 -1.39979621e-01  1.40201664e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+0

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 127.97it/s]


Generated 24142 candidate indices
Generating context vectors...
Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.19515764e+02 -8.88440066e-02  8.91734160e-03
  1.52651458e-01  1.23033830e-01  0.00000000e+00 -3.12220753e-02
  2.41168786e-02 -3.40498447e-02  0.00000000e+00  0.00000000e+00
  1.95596852e+00  5.82978945e-01 -7.22032921e+00  0.00000000e+00
  0.00000000e+00 -1.45071239e-02  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.41531915e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.41531915e-01 -3.53829787e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.19720489e-01
  0.00000000e+00  1.93573308e-01  0.00000000e+00  3.81274805e-01
  8.12987088e-03  0.00000000e+00  1.55873968e-02  6.86474499e-02
 -2.71240136e-05  0.00000000e+00  0.00000000e+00  1.84270236e-01
  1.32992380e-02  0.00000000e+00  9.84319150e-02 -1.00877753e-02
  0.00000000e+00 -2.10087509e-01

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 132.55it/s]

Generated 24142 candidate indices
Generating context vectors...





Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.29735161e+02 -3.39073026e-02 -2.18553362e-02
  1.07282666e-01  2.26879442e-01  0.00000000e+00 -2.27817451e-01
 -2.14231868e-02  1.36196420e-01  0.00000000e+00  0.00000000e+00
  2.03361347e+00  8.80417330e-01 -7.18547035e+00  0.00000000e+00
  0.00000000e+00 -5.25170597e-02  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.60289157e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.60289157e-01 -3.60650602e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.11549767e-01
  0.00000000e+00  1.86905157e-01  0.00000000e+00  3.82964210e-01
 -3.41052496e-02  0.00000000e+00  1.19833171e-01  1.34582935e-01
  2.53495937e-02  0.00000000e+00  0.00000000e+00  1.76531642e-01
  6.46947283e-02  0.00000000e+00  1.11809319e-01 -9.47154018e-03
  0.00000000e+00 -2.84118919e-01  1.70484995e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+0

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 191.16it/s]

Generated 24142 candidate indices
Generating context vectors...





Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.30071733e+02 -4.09877256e-02 -6.65562980e-02
  4.78409654e-02  2.53093872e-01  0.00000000e+00 -6.03186574e-02
  1.43450583e-01  3.23209051e-01  0.00000000e+00  0.00000000e+00
  2.07409392e+00  1.26390211e+00 -7.16729658e+00  0.00000000e+00
  0.00000000e+00 -8.88658650e-02  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.67697479e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.67697479e-01 -3.63344538e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.11355721e-01
  0.00000000e+00  1.86677763e-01  0.00000000e+00  3.82175943e-01
 -1.43985860e-02  0.00000000e+00  1.21273419e-01  1.59157634e-01
  5.14295850e-02  0.00000000e+00  1.48666667e-01  1.32803289e-01
  9.03455036e-02  0.00000000e+00  1.20677834e-01 -1.18395731e-02
  0.00000000e+00 -3.24654756e-01  2.10059072e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+0

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 130.43it/s]


Generated 24142 candidate indices
Generating context vectors...
Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.30346533e+02  3.31264800e-02 -5.38258068e-02
  6.53531093e-02  2.26366914e-01  0.00000000e+00 -2.20367133e-02
  2.77460411e-01  2.26915419e-01  0.00000000e+00  0.00000000e+00
  1.90493853e+00  1.39944598e+00 -7.24323923e+00  0.00000000e+00
  0.00000000e+00 -8.01259337e-02  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.71664516e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.71664516e-01 -3.64787097e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.11233665e-01
  0.00000000e+00  1.07033665e-01  0.00000000e+00  3.09530844e-01
 -4.44399034e-03  0.00000000e+00  1.22301391e-01  1.59681226e-01
  7.83849073e-02  0.00000000e+00  1.48666667e-01  2.27748991e-01
  1.64398238e-01  0.00000000e+00  1.61886448e-01 -3.65819877e-03
  0.00000000e+00 -3.64496639e-01

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 203.74it/s]

Generated 24142 candidate indices
Generating context vectors...





Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.32979540e+02  6.94346790e-02  8.93090762e-02
  2.45965208e-01  2.89376338e-01  0.00000000e+00 -4.08551663e-02
  2.13734645e-01  2.73573738e-01  0.00000000e+00  0.00000000e+00
  1.77684643e+00  4.96189746e+00 -7.30074643e+00  0.00000000e+00
  0.00000000e+00  3.63214295e-02  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.74136126e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.74136126e-01 -3.65685864e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.10706769e-01
  0.00000000e+00  1.06506769e-01  0.00000000e+00  2.90802968e-01
  1.92690316e-02  0.00000000e+00  1.27701931e-01  1.61476514e-01
  3.61059891e-02  0.00000000e+00  1.48666667e-01  9.33824829e-02
  8.83401450e-02  0.00000000e+00  1.33922854e-01  7.41113118e-03
  0.00000000e+00 -3.79992697e-01 -7.31676981e-02  0.00000000e+00
  0.00000000e+00  0.00000000e+0

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 133.44it/s]


Generated 24142 candidate indices
Generating context vectors...
Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.34098705e+02 -1.71451040e-01 -4.52459248e-02
  2.15620609e-01  2.71932849e-01  0.00000000e+00 -6.11101773e-02
  1.98047618e-01  1.71185923e-01  0.00000000e+00  0.00000000e+00
  1.81453409e+00  4.44482563e+00 -6.76140458e+00  0.00000000e+00
  0.00000000e+00  2.05992210e-03  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.75823789e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.75823789e-01 -3.66299559e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.09719509e-01
  0.00000000e+00  1.05519509e-01  0.00000000e+00  3.26938022e-01
  3.49120956e-02  0.00000000e+00  1.39260527e-01  1.61179463e-01
  3.16641409e-02  0.00000000e+00  1.48666667e-01  9.12549980e-02
  8.59160828e-02  0.00000000e+00  1.16381408e-01  1.66199025e-02
  0.00000000e+00 -3.85004864e-01

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 203.31it/s]

Generated 24142 candidate indices
Generating context vectors...





Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.34123521e+02 -1.68263480e-01 -4.14428678e-02
  2.47403384e-01  3.40771996e-01  2.66000000e-01 -4.13612882e-02
  2.17388411e-01  2.04145524e-01  0.00000000e+00  0.00000000e+00
  1.81748467e+00  4.96777094e+00 -6.17511169e+00  0.00000000e+00
  0.00000000e+00 -6.22431265e-04  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.77049430e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.77049430e-01 -3.66745247e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.08916108e-01
  0.00000000e+00  1.04716108e-01  0.00000000e+00  4.00008744e-01
  6.38513520e-02  0.00000000e+00  1.47856082e-01  1.62999011e-01
  3.21097173e-02  0.00000000e+00  1.48666667e-01  9.49530614e-02
  9.05382061e-02  0.00000000e+00  1.17014617e-01  1.85428540e-02
  0.00000000e+00 -4.23035569e-01 -2.35839252e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+0

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 132.26it/s]


Generated 24142 candidate indices
Generating context vectors...
Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.34237462e+02 -1.60581517e-01 -5.14511767e-02
  9.68039896e-02  3.70319806e-01  2.66000000e-01 -3.31901107e-02
  2.10359547e-01  1.75584767e-01  0.00000000e+00  0.00000000e+00
  1.82666784e+00  5.28011448e+00 -5.57153324e+00  0.00000000e+00
  0.00000000e+00 -8.97075942e-03  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.77979933e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.77979933e-01 -3.67083612e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.08590932e-01
  0.00000000e+00  1.04390932e-01  0.00000000e+00  4.04275139e-01
  8.51850653e-02  0.00000000e+00  1.50854092e-01  1.64958877e-01
 -2.74534389e-02  0.00000000e+00  1.60400000e-01  8.40519829e-02
  3.62400236e-02  0.00000000e+00  7.38574039e-02  2.07147701e-02
  0.00000000e+00 -4.62577026e-01

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 195.14it/s]

Generated 24142 candidate indices
Generating context vectors...





Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.34247120e+02 -1.85538210e-01 -3.25236632e-02
  1.33468090e-01  3.78477402e-01  2.66000000e-01  1.53692831e-02
  2.31045258e-01  2.06190630e-01  0.00000000e+00  0.00000000e+00
  1.83758570e+00  5.52506022e+00 -4.96836083e+00  0.00000000e+00
  0.00000000e+00 -1.88960864e-02  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.78710448e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.78710448e-01 -3.67349254e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.08200808e-01
  0.00000000e+00  1.04000808e-01  0.00000000e+00  4.13876329e-01
  1.10779847e-01  0.00000000e+00  1.54450907e-01  1.67310196e-01
 -2.19577411e-02  0.00000000e+00  1.60400000e-01  5.72833269e-02
  6.28592720e-02  0.00000000e+00  6.78114963e-02  2.22722002e-02
  0.00000000e+00 -4.85973466e-01 -2.85485657e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+0

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 133.32it/s]


Generated 24142 candidate indices
Generating context vectors...
Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.34247138e+02 -2.05566873e-01 -1.57065005e-02
  1.68934369e-01  4.01765070e-01  2.66000000e-01  6.13496824e-02
  2.52955548e-01  2.39157129e-01  0.00000000e+00  0.00000000e+00
  1.85003107e+00  5.71098368e+00 -4.36789154e+00  0.00000000e+00
  0.00000000e+00 -3.02100596e-02  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.79299191e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.79299191e-01 -3.67563342e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.07839573e-01
  0.00000000e+00  1.03639573e-01  0.00000000e+00  4.20925281e-01
  1.34479277e-01  0.00000000e+00  1.57781369e-01  1.69487395e-01
  1.53524678e-02  0.00000000e+00  1.60400000e-01  6.92128794e-02
  6.77144339e-02  0.00000000e+00  6.53288556e-02  2.47785540e-02
  0.00000000e+00 -5.15725434e-01

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 129.13it/s]


Generated 24142 candidate indices
Generating context vectors...
Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.34265321e+02 -2.60654664e-01 -3.95427001e-02
  9.68432650e-02  4.59652977e-01  2.66000000e-01  1.57235535e-01
  2.41491378e-01  2.03174326e-01  0.00000000e+00  0.00000000e+00
  1.87388729e+00  5.55257186e+00 -3.73057136e+00  0.00000000e+00
  0.00000000e+00 -5.18975386e-02  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.79783784e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.79783784e-01 -3.67739558e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.07722164e-01
  0.00000000e+00  1.03522164e-01  0.00000000e+00  3.99032491e-01
  1.42182116e-01  0.00000000e+00  1.58863843e-01  1.70195032e-01
  4.79524024e-02  0.00000000e+00  1.60400000e-01  7.57447281e-02
  7.66637921e-02  0.00000000e+00  6.91230701e-02  2.87783859e-02
  0.00000000e+00 -5.36558624e-01

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 199.95it/s]

Generated 24142 candidate indices
Generating context vectors...





Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.34295625e+02 -2.61337571e-01 -4.19997211e-02
  9.78148201e-02  4.45265563e-01  2.66000000e-01  1.42735716e-01
  2.33724006e-01  2.05213310e-01  0.00000000e+00  0.00000000e+00
  1.89704251e+00  5.41023332e+00 -3.09767025e+00  0.00000000e+00
  0.00000000e+00 -7.29477386e-02  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.80189616e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.80189616e-01 -3.67887133e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.07633195e-01
  0.00000000e+00  1.03433195e-01  0.00000000e+00  3.82323294e-01
  1.48019061e-01  0.00000000e+00  1.59684105e-01  1.70731256e-01
  6.17480597e-02  0.00000000e+00  1.60400000e-01  1.01459112e-01
  7.53613944e-02  0.00000000e+00  1.02295269e-01  3.22974642e-02
  0.00000000e+00 -5.53116433e-01 -4.29150319e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+0

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 136.26it/s]

Generated 24142 candidate indices
Generating context vectors...





Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.34438286e+02 -2.68734261e-01 -4.69845704e-02
  9.25915755e-02  4.38102944e-01  2.66000000e-01  1.37267249e-01
  2.24967345e-01  2.05237632e-01  0.00000000e+00  0.00000000e+00
  1.91864325e+00  5.30644833e+00 -2.47600821e+00  0.00000000e+00
  0.00000000e+00 -9.25847769e-02  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.80534447e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.80534447e-01 -3.68012526e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.08511755e-01
  0.00000000e+00  1.04311755e-01  0.00000000e+00  3.73194589e-01
  1.06828112e-01  0.00000000e+00  1.50761662e-01  1.66118577e-01
  1.24249724e-01  0.00000000e+00  1.60400000e-01  1.23139215e-01
  1.23379837e-01  0.00000000e+00  1.16300395e-01  3.28470968e-02
  0.00000000e+00 -5.66499974e-01 -4.36056454e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+0

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 205.34it/s]

Generated 24142 candidate indices
Generating context vectors...





Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.34438306e+02 -2.67851351e-01 -4.96521539e-02
  9.51497583e-02  4.17833921e-01  2.66000000e-01  1.16493275e-01
  2.14964549e-01  2.08353879e-01  0.00000000e+00  0.00000000e+00
  1.93774123e+00  5.28215252e+00 -1.85851777e+00  0.00000000e+00
  0.00000000e+00 -1.09946569e-01  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.80831068e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.80831068e-01 -3.68120388e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.09009743e-01
  0.00000000e+00  1.04809743e-01  0.00000000e+00  3.62386458e-01
  8.44205843e-02  0.00000000e+00  1.45657188e-01  1.63543024e-01
  9.39779560e-02  0.00000000e+00  1.60400000e-01  1.26164078e-01
  9.23874967e-02  0.00000000e+00  1.02969850e-01  3.47105059e-02
  0.00000000e+00 -5.89233457e-01 -4.61734243e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+0

Processing queries: 100%|██████████| 21/21 [00:00<00:00, 133.67it/s]


Generated 24142 candidate indices
Generating context vectors...
Selecting best configuration...
Theta - parameter vector: 
[ 0.00000000e+00 -1.34455397e+02 -2.70390752e-01 -5.40901739e-02
  9.45524075e-02  3.98966266e-01  2.66000000e-01  9.80846587e-02
  2.03033007e-01  2.10769575e-01  0.00000000e+00  0.00000000e+00
  1.95824985e+00  5.20596680e+00 -1.24529994e+00  0.00000000e+00
  0.00000000e+00 -1.28590776e-01  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.81088929e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.81088929e-01 -3.68214156e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.08947866e-01
  0.00000000e+00  1.04747866e-01  0.00000000e+00  3.53224553e-01
  8.99988026e-02  0.00000000e+00  1.46151738e-01  1.63978976e-01
  1.08265257e-01  0.00000000e+00  1.60400000e-01  1.35452164e-01
  9.45534912e-02  0.00000000e+00  1.08314287e-01  3.61352922e-02
  0.00000000e+00 -6.04661457e-01

#### Comparison with DTA recommendation

In [10]:
# Create the recommender used below to obtain index recommendations from the
# SQL Server `dta` tool (see the recorded output of the next cell)
dta = DTA_recommender()

In [13]:
# Ask DTA for recommendations on the raw SQL text of the first mini-workload query only.
# NOTE(review): the MAB rounds above used all 21 mini-workload queries with a
# 5*1024 MB budget, whereas DTA gets a single query and 20*1024 MB here, so the two
# recommendations are not produced under the same workload/budget — confirm this is intended.
queries = [q.query_string for q in miniworkload[:1]]
indexes_to_add, indexes_to_remove = dta.recommend_indexes(queries,  max_memory_Mb=20*1024, max_time_minutes=20)

All non-clustered indexes --> []
All nonclustered indexes removed.
Microsoft (R) SQL Server dta
Version 20.2.30.0
Copyright (c) Microsoft. All rights reserved.

Tuning session successfully created. Session ID is 81.

Time elapsed: 00:00:02            
Workload consumed:  100%, Estimated improvement:   59%                         
Total time used: 00:00:02                                


                                                                               
                                                                                
Tuning process finished.
Successfully saved output XML file: \\wsl.localhost\Ubuntu\home\tanzid\Code\DBMS\MAB\session_output_session_7368988f-9abd-4ae1-8344-5a52b2d6ab9c.xml.
Successfully generated recommendations script: \\wsl.localhost\Ubuntu\home\tanzid\Code\DBMS\MAB\recommendations_session_7368988f-9abd-4ae1-8344-5a52b2d6ab9c.sql.
DTA recommendation time --> 21.138886 seconds.


In [14]:
# Show the raw DTA recommendations: indexes to create, then indexes to drop
for index in indexes_to_add:
    print(index)

for index in indexes_to_remove:
    print(index)

# convert DTA index dicts to MAB index arm ids
DTA_recommended_configuration = []
for index in indexes_to_add:
    index_columns = index['index_columns']
    # remove ASC/DESC from column names (DTA reports key columns as e.g. 'l_shipdate ASC')
    index_columns = [re.sub(' ASC| DESC', '', col) for col in index_columns]
    # build the arm id from key columns, table name and include columns so it can be
    # compared with the ids used as keys by the MAB agent
    index_id = get_index_id(index_columns, index['table_name'], index['include_columns'])
    DTA_recommended_configuration.append(index_id)


# Display the converted arm ids as the cell output
DTA_recommended_configuration


{'index_name': '_dta_index_lineitem_6_965578478__K9_K10_K11_5_6_7_8', 'table_name': 'lineitem', 'index_columns': ['l_returnflag ASC', 'l_linestatus ASC', 'l_shipdate ASC'], 'include_columns': ['l_quantity', 'l_extendedprice', 'l_discount', 'l_tax']}


['IXN_lineitem_l_returnflag_l_linestatus_l_shipdate_l_qu_l_ex_l_di_l_ta']

In [19]:
# Inspect the configuration currently held by the MAB agent: a dict keyed by index
# arm id whose values are mab.Index objects (see the recorded output below)
mab.best_configuration

{'IX_lineitem_l_shipdate_l_orderkey': <mab.Index at 0x7f97f0003010>,
 'IX_orders_o_shippriority': <mab.Index at 0x7f97f0011f10>,
 'IX_orders_o_orderkey': <mab.Index at 0x7f97f0002510>,
 'IX_partsupp_ps_partkey_ps_suppkey': <mab.Index at 0x7f97f00c1610>,
 'IX_customer_c_mktsegment': <mab.Index at 0x7f97e82b3e50>,
 'IXN_orders_o_orderdate_o_sh': <mab.Index at 0x7f97f01c0410>,
 'IX_orders_o_orderdate_o_orderkey': <mab.Index at 0x7f97f0000a10>,
 'IX_orders_o_orderdate': <mab.Index at 0x7f97f00c23d0>,
 'IX_lineitem_l_commitdate': <mab.Index at 0x7f97f00035d0>,
 'IXN_part_p_partkey_p_mf': <mab.Index at 0x7f97f009ae90>,
 'IXN_part_p_type_p_pa': <mab.Index at 0x7f97f00c3150>,
 'IXN_partsupp_ps_suppkey_ps_s': <mab.Index at 0x7f97f00c3010>,
 'IX_supplier_s_suppkey_s_nationkey': <mab.Index at 0x7f97f005d590>,
 'IX_supplier_s_suppkey': <mab.Index at 0x7f97f00996d0>,
 'IX_lineitem_l_suppkey': <mab.Index at 0x7f97f0000dd0>,
 'IXN_part_p_size_p_mf_p_pa': <mab.Index at 0x7f97f00c1c90>,
 'IXN_part_p_si

In [94]:
# check intersection of MAB and DTA recommendations (exact string match on arm ids)
# NOTE(review): because ids are compared as strings, indexes with the same key columns
# but INCLUDE columns listed in a different order get different ids (compare the DTA id
# '..._l_qu_l_ex_l_di_l_ta' with the MAB candidate ids ending '..._l_ta_l_ex_l_di_l_qu'
# in the round output above). An empty set may therefore understate the real overlap —
# confirm how get_index_id orders include columns.
intersection = set(DTA_recommended_configuration).intersection(set(mab.best_configuration.keys()))

intersection

set()

In [27]:
# Inspect the parsed attributes of the first mini-workload query — the same query
# whose SQL text was passed to DTA above (miniworkload[:1])
test_query = miniworkload[0]
print(f"Query: {test_query.query_string}")
print(f"Predicates: {test_query.predicates}")
print(f"Payload: {test_query.payload}")
print(f"Order by: {test_query.order_by}")
print(f"Group by: {test_query.group_by}")



Query: select
	l_returnflag,
	l_linestatus,
	sum(l_quantity) as sum_qty,
	sum(l_extendedprice) as sum_base_price,
	sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
	sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
	avg(l_quantity) as avg_qty,
	avg(l_extendedprice) as avg_price,
	avg(l_discount) as avg_disc,
	count(*) as count_order
from
	lineitem
where
	l_shipdate <= DATEADD(dd, -63, CAST('1998-12-01' AS date))
group by
	l_returnflag,
	l_linestatus
order by
	l_returnflag,
	l_linestatus
;
Predicates: {'lineitem': {'l_shipdate': 'r'}}
Payload: {'lineitem': ['l_returnflag', 'l_linestatus', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax']}
Order by: {'lineitem': [['l_returnflag', 'asc'], ['l_linestatus', 'asc']]}
Group by: {'lineitem': ['l_returnflag', 'l_linestatus']}
