#### Online Index Selection Via Combinatorial Contextual Multi Armed Bandits

In [44]:
import logging
import datetime
import os
import subprocess
import uuid

import numpy as np
import pyodbc
import sys
import random
import pandas as pd
import time
import os
from tqdm import tqdm
import logging
import re
import json
import itertools
import math
from collections import defaultdict
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

import IPython
notebook_path = IPython.get_ipython().starting_dir
target_subdirectory_path = os.path.abspath(os.path.join(os.path.dirname(notebook_path), 'database'))
sys.path.append(target_subdirectory_path)
from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# read workload queries from JSON file
def read_workload(workload_filepath):
    workload = []
    with open(workload_filepath) as f:
        line = f.readline()
        # read the queries from each line
        while line:
            workload.append(json.loads(line))
            line = f.readline()

    return workload

# Base directory containing the generated queries
workload_filepath = '../datagen/TPCH_workloads/TPCH_static_100_workload.json'

# Read the workload queries from file
workload = read_workload(workload_filepath)
print(len(workload))

2100


#### MAB index selection algorithm. On each round, do the following:

1) Generate candidate arms/indices using mini-workload from previous round
2) Generate context vector for each candidate arm
3) Select the best super-arm, i.e. configuration/subset of candidate indices
4) Materialize the super-arm configuration, then execute new mini-workload for current round


We will implement these 4 steps separately in the given order.

#### 1. Generation of Candidate indices

In [50]:
"""
   Index class definition
"""
class Index:
    def __init__(self, table_name, index_id, index_columns, size, include_columns=(), value=None, payload_only=False):
        self.table_name = table_name
        self.index_id = index_id
        self.index_columns = index_columns
        self.size = size
        self.include_columns = include_columns
        self.value = value
        self.query_template_ids = None
        self.clustered_index_time = None
        self.context_vector_columns = None
        self.payload_only = payload_only
        self.index_usage_last = 0 

    def __str__(self):
        return f"Index({self.table_name}, {self.index_id}, {self.index_columns}, {self.include_columns}, {self.size}, {self.value})"



"""
    Given a query, generate candidate indices based on the predicates and payload columns in the query.
"""
def generate_candidate_indices_from_predicates(connection, query, MAX_COLUMNS=6, SMALL_TABLE_IGNORE=10000, TABLE_MIN_SELECTIVITY=0.2, verbose=False):
    # get all tables in the db
    tables = get_all_tables(connection)
    if verbose:
        print(f"Tables:")
        for key in tables:
            print(tables[key])

    query_template_id = query.template_id
    query_predicates = query.predicates
    query_payload = query.payload
    
    indices = {}

    # indexes on predicate columns only
    for table_name, table_predicates in query_predicates.items():
        table = tables[table_name]
        if verbose: print(f"\nTable --> {table_name}, Predicate Columns --> {set(table_predicates)}, table row count --> {table.row_count}")
        
        # identify include columns
        include_columns = []
        if table_name in query_payload:
            include_columns = list(set(query_payload[table_name]) - set(table_predicates))
        
        if verbose: 
            print(f"Include columns: {include_columns}")
            print(f"Query selectivity: {query.selectivity[table_name]}")


        # check if conditions for cheap full table scan are met
        if table.row_count < SMALL_TABLE_IGNORE or ((query.selectivity[table_name] > TABLE_MIN_SELECTIVITY) and (len(include_columns)>0)):
            if verbose: print(f"Full table scan for table: {table_name} is cheap, skipping")
            continue

        # generate all possible permutations of predicate columns, from single column up to MAX_COLUMNS-column indices
        table_predicates = list(table_predicates.keys())  #[0:6]
        col_permutations = []
        for num_columns in range(1, min(MAX_COLUMNS, len(table_predicates)+1)):
            col_permutations = col_permutations + list(itertools.permutations(table_predicates, num_columns)) 
        
        if verbose: print(f"Column permutations: \n{col_permutations}")

        # assign an id and value to each index/column permutation
        for cp in col_permutations:
            index_id = get_index_id(cp, table_name)
            
            if index_id not in indices:
                index_size = get_estimated_index_size(connection, table_name, cp)
                if verbose:  print(f"index_id: {index_id}, index columns: {cp}, index size: {index_size:.2f} Mb")
                # assign value...

                # create index object
                indices[index_id] = Index(table_name, index_id, cp, index_size)

    # indexes on columns that are in the payload but not in the predicates
    for table_name, table_payload in query_payload.items():
        table = tables[table_name]
        if verbose: print(f"\nTable --> {table_name}, Payload Columns --> {set(table_predicates)}, table row count --> {table.row_count}")
        
        # skip if any of the payload columns for this table are in the predicates
        if table_name in query_predicates:
            if verbose: print(f"Payload columns are in the predicates, skipping")
            continue

        # check if conditions for cheap full table scan are met
        if table.row_count < SMALL_TABLE_IGNORE:
            if verbose: print(f"Full table scan for table: {table_name} is cheap, skipping")
            continue   

        # don't need to consider permutations here, just create an index with all payload columns in given order
        index_id = get_index_id(table_payload, table_name)
        if index_id not in indices:
            index_size = get_estimated_index_size(connection, table_name, table_payload)
            print(f"index_id: {index_id}, index columns: {table_payload}, index size: {index_size:.2f} Mb")
            # assign value... (will assign less value to these indices as they are less useful compared to predicate indices)
            
            indices[index_id] = Index(table_name, index_id, table_payload, index_size, payload_only=True)

    # indexes with include columns
    for table_name, table_predicates in query_predicates.items():
        table = tables[table_name]
        if verbose: print(f"\nTable --> {table_name}, Predicate Columns --> {set(table_predicates)}, table row count --> {table.row_count}")
        
        # check if conditions for cheap full table scan are met
        if table.row_count < SMALL_TABLE_IGNORE:
            if verbose: print(f"Full table scan for table: {table_name} is cheap, skipping")
            continue  

        # identify include columns
        include_columns = []
        if table_name in query_payload:
            include_columns = sorted(list(set(query_payload[table_name]) - set(table_predicates)))

        if len(include_columns)>0:    
            if verbose: print(f"Include columns: {include_columns}")

            # generate all possible permutations of predicate columns
            table_predicates = list(table_predicates.keys())#[0:6]
            #col_permutations = list(itertools.permutations(table_predicates, len(table_predicates))) 
            col_permutations = list(itertools.permutations(table_predicates, MAX_COLUMNS)) 
            
            if verbose: print(f"Column permutations: \n{col_permutations}")

            # assign an id and value to each index/column permutation
            for cp in col_permutations:
                index_id = get_index_id(cp, table_name, include_columns)
                if index_id not in indices:
                    index_size = get_estimated_index_size(connection, table_name, list(cp) + include_columns)
                    if verbose: print(f"index_id: {index_id}, index columns: {cp}, include columns: {include_columns}, index size: {index_size:.2f} Mb")
                    # assign value...
                    
                    # create index object
                    indices[index_id] = Index(table_name, index_id, cp, index_size, tuple(include_columns))
            
    return indices        



"""
    Given a miniworkload, which is a list of query objects, generate candidate indices
"""
def generate_candidate_indices(connection, miniworkload, verbose=False):
    print(f"Gnereting candidate indices for {len(miniworkload)} queries...")
    index_arms = {} 
    for query in tqdm(miniworkload, desc="Processing queries"):
        query_candidate_indices = generate_candidate_indices_from_predicates(connection, query, verbose=verbose)
        for index_id, index in query_candidate_indices.items():
            if index_id not in index_arms:
                # initialization
                index.query_template_ids = set()
                index.clustered_index_time = 0
                index_arms[index_id] = index

            # add the maximum table scan time for the table associated with this index and query template
            index_arms[index_id].clustered_index_time += max(query.table_scan_times[index.table_name] if query.table_scan_times[index.table_name] else 0)   
            index_arms[index_id].query_template_ids.add(query.template_id)

    return index_arms


Test index generation for miniworkload of first 21 queries

In [51]:
connection = start_connection()

miniworkload = []
for query in workload[0:21]:
    # convert to Query object
    miniworkload.append(Query(connection, query['template_id'], query['query_string'], query['payload'], query['predicates'], query['order_bys']))

# genete candidate indices
index_arms = generate_candidate_indices(connection, miniworkload, verbose=False)

close_connection(connection)

Gnereting candidate indices for 21 queries...


Processing queries: 100%|██████████| 21/21 [00:01<00:00, 17.55it/s]


#### 2. Generation of Context Vectors for Each Arm/Index

The context vector of each index can be defined as a concatenation of two pieces:

* Columns Piece:  a vector with length equal to the total number of columns in the database. Each entry in this vector corresponds to one of the columns and contains the value $10^{-j}$ where $j$ is the position of that column in the index, provided that column is in the index, otherwise the value is zero. 

* Derived Context Piece: a vector of length 2, first component contains time stamp of last round when the index was used and second component is the size of the index relative to the entire database

In [52]:
connection = start_connection()

all_columns, num_columns = get_all_columns(connection)

close_connection(connection)

In [53]:
columns_to_idx = {}
i = 0
for table_name, columns in all_columns.items():
    for column in columns:
        columns_to_idx[column] = i
        i += 1

idx_to_columns = {v: k for k, v in columns_to_idx.items()}       

In [54]:
# generate columns piece
def generate_context_vector_columns_index(index, columns_to_idx, idx_to_columns):
    # return the cached context vector if available
    if index.context_vector_columns:
        return index.context_vector_columns

    context_vector = np.zeros(len(columns_to_idx), dtype=float)
    for j, column in enumerate(index.index_columns):
        context_vector[columns_to_idx[column]] = 10**(-j)

    # cache the context vector
    index.encode_context_vector = context_vector    

    return context_vector    


def generate_context_vector_columns(index_arms, columns_to_idx, idx_to_columns):
    # stack up the context vectors for all indices into a single matrix
    context_vectors = np.vstack([generate_context_vector_columns_index(index, columns_to_idx, idx_to_columns) for index in index_arms.values()])

    return context_vectors


# generate derived piece
def generate_context_vector_derived(connection, index_arms):
    database_size = get_database_size(connection)
    derived_context_vectors = np.zeros((len(index_arms), 2), dtype=float)
    for i, index in enumerate(index_arms.values()):
        derived_context_vectors[i,0] =  index.index_usage_last
        derived_context_vectors[i,1] =  index.size/database_size
    
    return derived_context_vectors


In [55]:
connection = start_connection()

# test context vector generation
context_vectors_columns = generate_context_vector_columns(index_arms, columns_to_idx, idx_to_columns)
print(context_vectors_columns.shape)

context_vectors_derived = generate_context_vector_derived(connection, index_arms)
print(context_vectors_derived.shape)

close_connection(connection)

(652, 61)
(652, 2)
