#### Online Index Selection using the Work Function Algorithm (WFA)

In [2]:
import logging
import datetime
import os
import subprocess
import uuid

import pyodbc
import sys
import random
import pandas as pd
import time
import os
from tqdm import tqdm
import logging
import re
import json
import xml.etree.ElementTree as ET
import itertools
import math
from collections import defaultdict
from tqdm import tqdm
import pickle

%load_ext autoreload
%autoreload 2

import IPython
notebook_path = IPython.get_ipython().starting_dir
target_subdirectory_path = os.path.abspath(os.path.join(os.path.dirname(notebook_path), 'database'))
sys.path.append(target_subdirectory_path)
from utils import *

#### First, we will define some helper functions for generating list of all possible configurations subject to constraints (i.e. max number of columns per index, max number of indices per configuration), along with cost estimation (such as transition costs and query execution cost in a hypothetical configuration). We will also precompute estimates of all index creation costs.

In [3]:
class Index:
    def __init__(self, table_name, index_id, index_columns, size, include_columns=(), value=None):
        self.table_name = table_name
        self.index_id = index_id
        self.index_columns = index_columns
        self.size = size
        self.include_columns = include_columns
        self.value = value

    def __str__(self):
        return f"Index({self.table_name}, {self.index_id}, {self.index_columns}, {self.include_columns}, {self.size}, {self.value})"


"""
    Function for generating all possible configurations (i.e. subsets of indixes) and also precomputing index creation cost estimates
"""
def generate_all_configurations(connection, MAX_INDICES_PER_CONFIG=2, MAX_COLS=2, verbose=False):
    # first, generate all possible indices
    tables = get_all_tables(connection)
    all_indices = {} 
    # tqdm bar around table loop
    for table_name, table in tqdm(tables.items(), desc="Generating all indices for table:"):
        columns = table.get_columns()
        if verbose:
            print(f"Table --> {table_name} with columns --> {columns}")
        # get all possible permutations of columns, up to MAX_KEY_COLS columns
        for num_columns in range(1, MAX_COLS+1):
            col_permutations = list(itertools.permutations(columns, num_columns))
            # also generate permutations of columns with include columns
            for cp in col_permutations:
                # get columns not in cp
                include_columns = list(set(columns) - set(cp))
                # get all permutations of MAX_INCLUDE_COLS include columns
                include_col_permutations = list(itertools.permutations(include_columns, MAX_COLS-num_columns))
                for icp in include_col_permutations:
                    index_id = get_index_id(cp, table_name, include_columns)
                    if index_id not in all_indices:
                        index_size = get_estimated_index_size(connection, table_name, list(cp) + include_columns)
                        # create index object
                        all_indices[index_id] = Index(table_name, index_id, cp, index_size, tuple(icp))

    print(f"Total number of indices generated: {len(all_indices)}, total estimated size: {sum([i.size for i in all_indices.values()]):.2f} Mb")

    # now estimate the creation cost of each index (we this by creating the index and then dropping it, which is potentially 
    # very expensive, but I don't know a more efficient way)
    index_creation_cost = {}
    for index_id, index in tqdm(all_indices.items(), desc="Estimating index creation cost:"):
        index_creation_cost[index_id] = create_nonclustered_index_object(connection, index)
        drop_noncluster_index_object(connection, index)

    # now generate all possible configurations with up to MAX_INDICES_PER_CONFIG indices
    all_configurations = []
    print(f"Gneretaing all possible configurations with up to {MAX_INDICES_PER_CONFIG} indices.")
    for num_indices in range(1, MAX_INDICES_PER_CONFIG+1):
        all_configurations.append(list(itertools.combinations(all_indices.values(), num_indices)))

    all_configurations = list(itertools.chain.from_iterable(all_configurations))


    return all_indices, index_creation_cost, all_configurations, tables


#### Generate all possible configurations with up to 2 indices, each index containing up to 2 columns.

In [4]:
connection = start_connection() 

# wipe out all non clustered indices from database
remove_all_nonclustered_indexes(connection)

all_indices, index_creation_cost, all_configurations, tables = generate_all_configurations(connection, MAX_INDICES_PER_CONFIG=2, MAX_COLS=2, verbose=True)

close_connection(connection)        

All non-clustered indexes --> [('dbo', 'customer', 'IX_customer_c_custkey'), ('dbo', 'lineitem', 'IXN_lineitem_l_shipdate_l_suppkey_l_partkey_l_qu'), ('dbo', 'lineitem', 'IXN_lineitem_l_suppkey_l_partkey_l_orderkey_l_di_l_ex_l_qu'), ('dbo', 'partsupp', 'IX_partsupp_ps_partkey'), ('dbo', 'supplier', 'IX_supplier_s_nationkey'), ('dbo', 'supplier', 'IX_supplier_s_nationkey_s_suppkey'), ('dbo', 'supplier', 'IX_supplier_s_suppkey_s_nationkey')]
All nonclustered indexes removed.


Generating all indices for table:: 100%|██████████| 8/8 [00:00<00:00, 1884.87it/s]


Table --> customer with columns --> {'c_custkey': <utils.Column object at 0x7f24ea917d90>, 'c_name': <utils.Column object at 0x7f24ea916cd0>, 'c_address': <utils.Column object at 0x7f24eae2ab10>, 'c_nationkey': <utils.Column object at 0x7f24eae28c10>, 'c_phone': <utils.Column object at 0x7f24eae2acd0>, 'c_acctbal': <utils.Column object at 0x7f24eae2aa10>, 'c_mktsegment': <utils.Column object at 0x7f24eae2ad90>, 'c_comment': <utils.Column object at 0x7f24eae2a150>}
Table --> orders with columns --> {'o_orderkey': <utils.Column object at 0x7f24eaf2e590>, 'o_custkey': <utils.Column object at 0x7f24eaf33910>, 'o_orderstatus': <utils.Column object at 0x7f24eaf33610>, 'o_totalprice': <utils.Column object at 0x7f24eaf31990>, 'o_orderdate': <utils.Column object at 0x7f24eaf30c50>, 'o_orderpriority': <utils.Column object at 0x7f24dc09ef90>, 'o_clerk': <utils.Column object at 0x7f24dc09fd10>, 'o_shippriority': <utils.Column object at 0x7f24dc09c910>, 'o_comment': <utils.Column object at 0x7f24dc

Estimating index creation cost:: 100%|██████████| 581/581 [1:01:03<00:00,  6.31s/it]


Gneretaing all possible configurations with up to 2 indices.


In [5]:
# save copy of all_indices, index_creation_cost, all_configurations
"""
with open('all_indices.pkl', 'wb') as f:
    pickle.dump(all_indices, f)

with open('index_creation_cost.pkl', 'wb') as f:
    pickle.dump(index_creation_cost, f)

with open('all_configurations.pkl', 'wb') as f:
    pickle.dump(all_configurations, f)
"""

# load copy of all_indices, index_creation_cost, all_configurations
"""
with open('all_indices.pkl', 'rb') as f:
    all_indices = pickle.load(f)

with open('index_creation_cost.pkl', 'rb') as f:
    index_creation_cost = pickle.load(f)

with open('all_configurations.pkl', 'rb') as f:
    all_configurations = pickle.load(f)
"""

"\nwith open('all_indices.pkl', 'rb') as f:\n    all_indices = pickle.load(f)\n\nwith open('index_creation_cost.pkl', 'rb') as f:\n    index_creation_cost = pickle.load(f)\n\nwith open('all_configurations.pkl', 'rb') as f:\n    all_configurations = pickle.load(f)\n"

#### Now let's implement WFA

In [20]:
import numpy as np

class WFA:
    def __init__(self, all_configurations, all_indices, index_creation_cost):
        self.all_configurations = all_configurations
        self.all_indices = all_indices
        self.index_creation_cost = index_creation_cost
        #self.best_configurations = []
        #self.best_cost = float('inf')
        self.current_recommendation= None
        self.current_recommendation_id = None
        self.w = np.zeros(shape=(len(all_configurations)))


    def transition(self, C_old, C_new, connection):       
        # find out which indices need to be added and removed
        indices_old = set([i.index_id for i in C_old])
        indices_new = set([i.index_id for i in C_new])
        indices_added = indices_new - indices_old
        indices_added = [self.all_indices[i] for i in indices_added]
        indices_removed = indices_old - indices_new
        indices_removed = [self.all_indices[i] for i in indices_removed]

        # implement configuration change
        total_index_creation_cost = bulk_create_drop_nonclustered_indexes(connection, indices_added, indices_removed)

        return total_index_creation_cost


    # estimation of cost of transition from configuration Ci to Cj
    def get_transition_cost(self, Ci, Cj):
        # find out which indices need to be added and removed
        indices_old = set([i.index_id for i in Ci])
        indices_new = set([i.index_id for i in Cj])
        indices_added = indices_new - indices_old
        indices_added = [self.all_indices[i] for i in indices_added]
        # compute cost of adding indices (assume that cost of removing indices is negligible)
        total_index_creation_cost = sum([self.index_creation_cost[i.index_id] for i in indices_added])
        return total_index_creation_cost
    

    # estimation of cost of executing query/mini-workload q in configuration C_new    
    # q needs to be a list of query strings
    def get_execution_cost(self, q, C_old, C_new, connection, cost_type='hypothetical'):
        if cost_type == 'hypothetical':
            # find out which indices need to be added and removed
            indices_old = set([i.index_id for i in C_old])
            indices_new = set([i.index_id for i in C_new])
            indices_added = indices_new - indices_old
            indices_added = [self.all_indices[i] for i in indices_added]
            indices_removed = indices_old - indices_new
            indices_removed = [self.all_indices[i] for i in indices_removed]

            _, total_execution_cost, _ = hyp_configuration_cost_estimate(connection, indexes_added, indexes_removed, q, verbose=False)

        elif cost_type == 'exact':
            # for exact cost estimation, we will first transition to the configuration C, then execute the query/mini-workload q, then transition back to the original configuration
            C_original = self.current_configuration
            _ = self.transition(C_old, C_new, connection)
            total_execution_cost =  bulk_execute_queries(q, connection)
            _ = self.transition(C_new, C_old, connection)

        else:
            raise ValueError(f"Cost type {cost_type} not supported.")

        return total_execution_cost


    # work function initialization
    def initialize_w(self, i):
        # initial configuration
        C_init = self.all_configurations[i] 
        self.current_recommendation = C_init
        self.current_recommendation_id = i
        for j, X in enumerate(self.all_configurations):
            self.w[j] = self.get_transition_cost(C_init, X) 

    # run 1 step of WFA for given query/mini-workload q to recommend a configuration   
    def recommend(self, q):
        # evaluate work function for all possible configurations
        w_new = np.zeros(shape=(len(self.all_configurations)))
        best_configuration = []
        for i, C_new in enumerate(self.all_configurations):
            # compute all possible transition and service costs
            min_value = float('inf')
            min_j = None
            for j, C_old in enumerate(self.all_configurations):
                if j == i:
                    transition_cost = 0
                else:
                    transition_cost = self.get_transition_cost(C_old, C_new)
                
                service_cost = self.get_execution_cost(q, C_old)
                value = self.w[j] + transition_cost + service_cost
                if value < min_value:
                    min_value = value
                    min_j = j

            # find the minimum work function value
            w_new[i] = min_value
            # store the best configuration
            best_configuration.append(self.all_configurations[min_j])    

        # overwrite the work function values
        self.w = w_new    

        # compute scores for all configurations
        scores = np.zeros(shape=(len(self.all_configurations)))
        min_score = float('inf')
        best_new_configuration_id = None
        for i in range(len(self.all_configurations)):
            scores[i] = self.w[i]  + self.get_transition_cost(self.all_configurations[i], self.current_recommendation)
            if scores[i] < min_score:
                min_score = scores[i]
                best_new_configuration_id = i

        # update the current recommendation
        self.current_recommendation = self.all_configurations[best_new_configuration_id]  
        return self.current_recommendation      



In [None]:
# read workload queries from JSON file
def read_workload(workload_filepath):
    workload = []
    with open(workload_filepath) as f:
        line = f.readline()
        # read the queries from each line
        while line:
            workload.append(json.loads(line))
            line = f.readline()

    return workload

# Base directory containing the generated queries
workload_filepath = '../datagen/TPCH_workloads/TPCH_static_100_workload.json'

# Read the workload queries from file
workload = read_workload(workload_filepath)
print(len(workload))