#### Non-clustered Index configuration generator

Given a query, we will use it's properties (predicates, payload) to generate a list of candidate index configuration that could benefit the execution of that query.

In [13]:
import logging
import datetime
import os
import subprocess
import uuid

import pyodbc
import sys
import random
import pandas as pd
import time
import os
from tqdm import tqdm
import logging
import re
import json
import xml.etree.ElementTree as ET
import itertools
import math
from collections import defaultdict

%load_ext autoreload
%autoreload 2

import IPython
notebook_path = IPython.get_ipython().starting_dir
target_subdirectory_path = os.path.abspath(os.path.join(os.path.dirname(notebook_path), 'database'))
sys.path.append(target_subdirectory_path)
from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# read workload queries from JSON file
def read_workload(workload_filepath):
    workload = []
    with open(workload_filepath) as f:
        line = f.readline()
        # read the queries from each line
        while line:
            workload.append(json.loads(line))
            line = f.readline()

    return workload

# Base directory containing the generated queries
workload_filepath = '../datagen/TPCH_workloads/TPCH_static_100_workload.json'

# Read the workload queries from file
workload = read_workload(workload_filepath)
print(len(workload))

2100


#### To generate candidate index configurations that may benefit a given query, can do the following:

* Look at each table in that query (If the table is too small, then full table scan is cheap so don't need to index. Also, if a table has high "selectivity" and also contains INCLUDE columns, then most likely a large proportion of it's rows will be returned, so again full table scan will be cheap so don't need to index)
* For each of these tables, look at the corresponding predicate columns (these are usually columns under the WHERE clause)    
* Identify the INCLUDE columns, which are columns that are in the payload (payload columns are usually under the SELECT clause) but are not predicate columns, i.e. columns which are needed in the query result but are not used for filtering
* Then generate multicolumn indexes without include columns by enumerating all permutations of the predicate columns, ranging from single-column permutations up to 6-column permutations (indexes on more than 6 columns becomes impractical) 
* Similarly, we generate multicolumn indexes by considering columns that are only in the payload but not in any predicate. Here, we don't need to consider all different column combinations, we can just make a single index for all payload columns for a given table in whatever order, this will mainly just serve as a covering index 
* Finally, we create indexes on tables with both predicate and payload columns. Here we consider indexes on all permutations of the predicate columns as index columns along with the include columns.
* For each index, we also estimate it's value.

In [3]:
class Index:
    def __init__(self, table_name, index_id, index_columns, size, include_columns=(), value=None):
        self.table_name = table_name
        self.index_id = index_id
        self.index_columns = index_columns
        self.size = size
        self.include_columns = include_columns
        self.value = value

    def __str__(self):
        return f"Index({self.table_name}, {self.index_id}, {self.index_columns}, {self.include_columns}, {self.size}, {self.value})"

#### Pick a test query and generate all it's candidadte indices

In [7]:
# constants
SMALL_TABLE_IGNORE = 10000
TABLE_MIN_SELECTIVITY = 0.2

connection = start_connection() 
tables = get_all_tables(connection)
all_columns = get_all_columns(connection)

# get all tables in db
print(f"Tables:")
for key in tables:
    print(tables[key])

print(f"\nAll columns: {all_columns}\n")    

# pick a query from the workload, get it's predicates and payload
i = 1
query = workload[i]
# convert to proper query object
query = Query(connection, query['template_id'], query['query_string'], query['payload'], query['predicates'], query['order_bys'])

query_template_id = query.template_id
query_predicates = query.predicates
query_payload = query.payload
print()
#print(f"Query: {query.query_string}")
print()
print(f"Payload: {query_payload}")
print()

indices = []

# indexes on predicate columns only
for table_name, table_predicates in query_predicates.items():
    table = tables[table_name]
    print(f"\nTable --> {table_name}, Predicate Columns --> {set(table_predicates)}, table row count --> {table.row_count}")
    
    # identify include columns
    include_columns = []
    if table_name in query_payload:
        include_columns = list(set(query_payload[table_name]) - set(table_predicates))

    print(f"Include columns: {include_columns}")
    print(f"Query selectivity: {query.selectivity[table_name]}")


    # check if conditions for cheap full table scan are met
    if table.row_count < SMALL_TABLE_IGNORE or ((query.selectivity[table_name] > TABLE_MIN_SELECTIVITY) and (len(include_columns)>0)):
        print(f"Full table scan for table: {table_name} is cheap, skipping")
        continue

    # generate all possible permutations of predicate columns, from single column up to 6-column indices
    table_predicates = list(table_predicates.keys())[0:6]
    col_permutations = []
    for num_columns in range(1, min(6, len(table_predicates)+1)):
        col_permutations = list(itertools.permutations(table_predicates, num_columns)) 
    
    print(f"Column permutations: \n{col_permutations}")

    # assign an id and value to each index/column permutation
    for cp in col_permutations:
        index_id = get_index_id(cp, table_name)
        index_size = get_estimated_index_size(connection, table_name, cp)
        print(f"index_id: {index_id}, index columns: {cp}, index size: {index_size:.2f} Mb")
        # assign value...
        # create index object
        index = Index(table_name, index_id, cp, index_size)
        indices.append(index)



# indexes on columns that are in the payload but not in the predicates
for table_name, table_payload in query_payload.items():
    table = tables[table_name]
    print(f"\nTable --> {table_name}, Payload Columns --> {set(table_predicates)}, table row count --> {table.row_count}")
    
    # skip if any of the payload columns for this table are in the predicates
    if table_name in query_predicates:
        print(f"Payload columns are in the predicates, skipping")
        continue

    # check if conditions for cheap full table scan are met
    if table.row_count < SMALL_TABLE_IGNORE:
        print(f"Full table scan for table: {table_name} is cheap, skipping")
        continue   

    # don't need to consider permutations here, just create an index with all payload columns in given order
    index_id = get_index_id(table_payload, table_name)
    index_size = get_estimated_index_size(connection, table_name, table_payload)
    print(f"index_id: {index_id}, index columns: {table_payload}, index size: {index_size:.2f} Mb")
    # assign value... (will assign less value to these indices as they are less useful compared to predicate indices)
    indices.append(Index(table_name, index_id, table_payload, index_size))

# indexes with include columns
for table_name, table_predicates in query_predicates.items():
    table = tables[table_name]
    print(f"\nTable --> {table_name}, Predicate Columns --> {set(table_predicates)}, table row count --> {table.row_count}")
    
    # check if conditions for cheap full table scan are met
    if table.row_count < SMALL_TABLE_IGNORE:
        print(f"Full table scan for table: {table_name} is cheap, skipping")
        continue  

    # identify include columns
    include_columns = []
    if table_name in query_payload:
        include_columns = sorted(list(set(query_payload[table_name]) - set(table_predicates)))

    if len(include_columns)>0:    
        print(f"Include columns: {include_columns}")

        # generate all possible permutations of predicate columns, from single column up to 6-column indices
        table_predicates = list(table_predicates.keys())[0:6]
        col_permutations = list(itertools.permutations(table_predicates, len(table_predicates))) 
        
        print(f"Column permutations: \n{col_permutations}")

        # assign an id and value to each index/column permutation
        for cp in col_permutations:
            index_id = get_index_id(cp, table_name, include_columns)
            index_size = get_estimated_index_size(connection, table_name, list(cp) + include_columns)
            print(f"index_id: {index_id}, index columns: {cp}, include columns: {include_columns}, index size: {index_size:.2f} Mb")
            # assign value...
            # create index object
            index = Index(table_name, index_id, cp, index_size, tuple(include_columns))
            indices.append(index)
    

close_connection(connection)

Tables:
Table: customer, Row Count: 150000, PK Columns: ['c_custkey']
Table: orders, Row Count: 1500000, PK Columns: ['o_orderkey']
Table: lineitem, Row Count: 6001215, PK Columns: ['l_linenumber', 'l_orderkey']
Table: part, Row Count: 200000, PK Columns: ['p_partkey']
Table: supplier, Row Count: 10000, PK Columns: ['s_suppkey']
Table: partsupp, Row Count: 800000, PK Columns: ['ps_partkey', 'ps_suppkey']
Table: nation, Row Count: 25, PK Columns: ['n_nationkey']
Table: region, Row Count: 5, PK Columns: ['r_regionkey']

All columns: (defaultdict(<class 'list'>, {'customer': ['c_acctbal', 'c_address', 'c_comment', 'c_custkey', 'c_mktsegment', 'c_name', 'c_nationkey', 'c_phone'], 'orders': ['o_clerk', 'o_comment', 'o_custkey', 'o_orderdate', 'o_orderkey', 'o_orderpriority', 'o_orderstatus', 'o_shippriority', 'o_totalprice'], 'lineitem': ['l_comment', 'l_commitdate', 'l_discount', 'l_extendedprice', 'l_linenumber', 'l_linestatus', 'l_orderkey', 'l_partkey', 'l_quantity', 'l_receiptdate', 'l

In [11]:
# generates candidate indices using predicates and payload of a given query (should be a Query object)
def generate_indices(connection, query, SMALL_TABLE_IGNORE=10000, TABLE_MIN_SELECTIVITY=0.2, verbose=False):
    # get all tables in the db
    tables = get_all_tables(connection)
    if verbose:
        print(f"Tables:")
        for key in tables:
            print(tables[key])

    query_template_id = query.template_id
    query_predicates = query.predicates
    query_payload = query.payload
    
    indices = {}

    # indexes on predicate columns only
    for table_name, table_predicates in query_predicates.items():
        table = tables[table_name]
        if verbose: print(f"\nTable --> {table_name}, Predicate Columns --> {set(table_predicates)}, table row count --> {table.row_count}")
        
        # identify include columns
        include_columns = []
        if table_name in query_payload:
            include_columns = list(set(query_payload[table_name]) - set(table_predicates))
        
        if verbose: 
            print(f"Include columns: {include_columns}")
            print(f"Query selectivity: {query.selectivity[table_name]}")


        # check if conditions for cheap full table scan are met
        if table.row_count < SMALL_TABLE_IGNORE or ((query.selectivity[table_name] > TABLE_MIN_SELECTIVITY) and (len(include_columns)>0)):
            if verbose: print(f"Full table scan for table: {table_name} is cheap, skipping")
            continue

        # generate all possible permutations of predicate columns, from single column up to 6-column indices
        table_predicates = list(table_predicates.keys())[0:6]
        col_permutations = []
        for num_columns in range(1, min(6, len(table_predicates)+1)):
            col_permutations = list(itertools.permutations(table_predicates, num_columns)) 
        
        if verbose: print(f"Column permutations: \n{col_permutations}")

        # assign an id and value to each index/column permutation
        for cp in col_permutations:
            index_id = get_index_id(cp, table_name)
            
            if index_id not in indices:
                index_size = get_estimated_index_size(connection, table_name, cp)
                if verbose:  print(f"index_id: {index_id}, index columns: {cp}, index size: {index_size:.2f} Mb")
                # assign value...

                # create index object
                indices[index_id] = Index(table_name, index_id, cp, index_size)

    # indexes on columns that are in the payload but not in the predicates
    for table_name, table_payload in query_payload.items():
        table = tables[table_name]
        if verbose: print(f"\nTable --> {table_name}, Payload Columns --> {set(table_predicates)}, table row count --> {table.row_count}")
        
        # skip if any of the payload columns for this table are in the predicates
        if table_name in query_predicates:
            if verbose: print(f"Payload columns are in the predicates, skipping")
            continue

        # check if conditions for cheap full table scan are met
        if table.row_count < SMALL_TABLE_IGNORE:
            if verbose: print(f"Full table scan for table: {table_name} is cheap, skipping")
            continue   

        # don't need to consider permutations here, just create an index with all payload columns in given order
        index_id = get_index_id(table_payload, table_name)
        if index_id not in indices:
            index_size = get_estimated_index_size(connection, table_name, table_payload)
            print(f"index_id: {index_id}, index columns: {table_payload}, index size: {index_size:.2f} Mb")
            # assign value... (will assign less value to these indices as they are less useful compared to predicate indices)
            
            indices[index_id] = Index(table_name, index_id, table_payload, index_size)

    # indexes with include columns
    for table_name, table_predicates in query_predicates.items():
        table = tables[table_name]
        if verbose: print(f"\nTable --> {table_name}, Predicate Columns --> {set(table_predicates)}, table row count --> {table.row_count}")
        
        # check if conditions for cheap full table scan are met
        if table.row_count < SMALL_TABLE_IGNORE:
            if verbose: print(f"Full table scan for table: {table_name} is cheap, skipping")
            continue  

        # identify include columns
        include_columns = []
        if table_name in query_payload:
            include_columns = sorted(list(set(query_payload[table_name]) - set(table_predicates)))

        if len(include_columns)>0:    
            if verbose: print(f"Include columns: {include_columns}")

            # generate all possible permutations of predicate columns, from single column up to 6-column indices
            table_predicates = list(table_predicates.keys())[0:6]
            col_permutations = list(itertools.permutations(table_predicates, len(table_predicates))) 
            
            if verbose: print(f"Column permutations: \n{col_permutations}")

            # assign an id and value to each index/column permutation
            for cp in col_permutations:
                index_id = get_index_id(cp, table_name, include_columns)
                if index_id not in indices:
                    index_size = get_estimated_index_size(connection, table_name, list(cp) + include_columns)
                    if verbose: print(f"index_id: {index_id}, index columns: {cp}, include columns: {include_columns}, index size: {index_size:.2f} Mb")
                    # assign value...
                    
                    # create index object
                    indices[index_id] = Index(table_name, index_id, cp, index_size, tuple(include_columns))
            
    return indices        


In [12]:
connection = start_connection() 

# generate candidate indices for all queries in first round (i.e. first 21 queries in TPC-H static workload)
candidate_indices = {}
for query in workload[0:21]:
    # convert to Query object
    query = Query(connection, query['template_id'], query['query_string'], query['payload'], query['predicates'], query['order_bys'])
    indices = generate_indices(connection, query, verbose=False)
    for index_id, index in indices.items():
        if index_id not in candidate_indices:
            candidate_indices[index_id] = index

close_connection(connection)

In [14]:
from tqdm import tqdm

print(f"Total number of candidate indices: {len(candidate_indices)}")
print(f"Total size of all candidate indices: {sum([index.size for index in candidate_indices.values()]):.2f} Mb")

connection = start_connection() 

# for each candidate index, calculate the cost of the query with and without the index, i.e. the estimated benefit of the index
# assuming the starting configuration is no indices
candidate_indices_values = defaultdict(float)
for index_id in tqdm(candidate_indices.keys()):
    for query in workload[0:21]:
        indexes_added = [candidate_indices[index_id]]
        indexes_removed = [] 
        total_orig_cost, total_hyp_cost, hyp_index_creation_cost = hyp_configuration_cost_estimate(connection, indexes_added, indexes_removed, [query['query_string']], verbose=False)
        # compute the value, defined as the estimated time saved by using the index
        value = total_orig_cost - total_hyp_cost
        # add up the values for all the queries 
        candidate_indices_values[index_id] += value

close_connection(connection)        

Total number of candidate indices: 394
Total size of all candidate indices: 63134.69 Mb


  2%|▏         | 9/394 [01:19<54:50,  8.55s/it]  

In [8]:
print(f"Candidate indices:")
for index in indices:
    print(index)

Candidate indices:
Index(part, IX_part_p_size_p_type, ('p_size', 'p_type'), (), 7.43865966796875, None)
Index(part, IX_part_p_type_p_size, ('p_type', 'p_size'), (), 7.43865966796875, None)
Index(partsupp, IX_partsupp_ps_suppkey_ps_partkey, ('ps_suppkey', 'ps_partkey'), (), 11.444091796875, None)
Index(partsupp, IX_partsupp_ps_partkey_ps_suppkey, ('ps_partkey', 'ps_suppkey'), (), 11.444091796875, None)
Index(supplier, IX_supplier_s_nationkey, ('s_nationkey',), (), 0.1430511474609375, None)
Index(part, IXN_part_p_size_p_type_p_mf_p_pa, ('p_size', 'p_type'), ('p_mfgr', 'p_partkey'), 12.20703125, None)
Index(part, IXN_part_p_type_p_size_p_mf_p_pa, ('p_type', 'p_size'), ('p_mfgr', 'p_partkey'), 12.20703125, None)
Index(partsupp, IXN_partsupp_ps_suppkey_ps_partkey_ps_s, ('ps_suppkey', 'ps_partkey'), ('ps_supplycost',), 18.310546875, None)
Index(partsupp, IXN_partsupp_ps_partkey_ps_suppkey_ps_s, ('ps_partkey', 'ps_suppkey'), ('ps_supplycost',), 18.310546875, None)
Index(supplier, IXN_supplier

#### Test index creations

In [9]:
connection = start_connection() 

# get list of all indexes currently in the database
existing_indexes = get_nonclustered_indexes(connection)
existing_indexes = [x[2] for x in existing_indexes]
print(f"\nExisting indexes: {existing_indexes}\n")

# create all candidadte indexes for the test query
index_creation_time = 0
index_size = 0
for index in indices[:4]:
    if index.index_id not in existing_indexes:
        index_creation_time += create_nonclustered_index_object(connection, index, verbose=True)
        index_size += index.size
    else:
        print(f"Index {index.index_id} already exists, skipping")

print(f"\nTotal index creation time: {index_creation_time:.2f} seconds, Total Size: {index_size} Mb")

close_connection(connection)


Existing indexes: []

Created index --> [dbo].[part].[IX_part_p_size_p_type], Indexed Columns --> ('p_size', 'p_type'), Included Columns --> (), index creation time: 0.658 seconds
Created index --> [dbo].[part].[IX_part_p_type_p_size], Indexed Columns --> ('p_type', 'p_size'), Included Columns --> (), index creation time: 0.682 seconds
Created index --> [dbo].[partsupp].[IX_partsupp_ps_suppkey_ps_partkey], Indexed Columns --> ('ps_suppkey', 'ps_partkey'), Included Columns --> (), index creation time: 0.846 seconds
Created index --> [dbo].[partsupp].[IX_partsupp_ps_partkey_ps_suppkey], Indexed Columns --> ('ps_partkey', 'ps_suppkey'), Included Columns --> (), index creation time: 0.455 seconds

Total index creation time: 2.64 seconds, Total Size: 37.7655029296875 Mb


#### Test Index dropping

In [10]:
connection = start_connection() 

# get list of all indexes currently in the database
existing_indexes = get_nonclustered_indexes(connection)
existing_indexes = [x[2] for x in existing_indexes]
print(f"\nExisting indexes: {existing_indexes}\n")

# create all candidadte indexes for the test query
for index in indices:
    if index.index_id in existing_indexes:
        drop_noncluster_index_object(connection, index, verbose=True)
    else:
        print(f"Index {index.index_id} does not exists, skipping")

close_connection(connection)


Existing indexes: ['IX_part_p_size_p_type', 'IX_part_p_type_p_size', 'IX_partsupp_ps_partkey_ps_suppkey', 'IX_partsupp_ps_suppkey_ps_partkey']

Dropped index --> [dbo].[part].[IX_part_p_size_p_type]
Dropped index --> [dbo].[part].[IX_part_p_type_p_size]
Dropped index --> [dbo].[partsupp].[IX_partsupp_ps_suppkey_ps_partkey]
Dropped index --> [dbo].[partsupp].[IX_partsupp_ps_partkey_ps_suppkey]
Index IX_supplier_s_nationkey does not exists, skipping
Index IXN_part_p_size_p_type_p_mf_p_pa does not exists, skipping
Index IXN_part_p_type_p_size_p_mf_p_pa does not exists, skipping
Index IXN_partsupp_ps_suppkey_ps_partkey_ps_s does not exists, skipping
Index IXN_partsupp_ps_partkey_ps_suppkey_ps_s does not exists, skipping
Index IXN_supplier_s_nationkey_s_ac_s_ad_s_co_s_na_s_ph does not exists, skipping


#### Test hypothetical query execution cost with indices vs without

In [104]:
connection = start_connection() 

indexes_added = indices[:4]
indexes_removed = [] #indices[4:]
print(hyp_configuration_cost_estimate(connection, indexes_added, indexes_removed,  [query.query_string], verbose=True))

close_connection(connection)

Created hypothetical index --> [part].[IX_part_p_size_p_type]
Created hypothetical index --> [part].[IX_part_p_type_p_size]
Created hypothetical index --> [partsupp].[IX_partsupp_ps_suppkey_ps_partkey]
Created hypothetical index --> [partsupp].[IX_partsupp_ps_partkey_ps_suppkey]
Enabling hypothetical index: (5, 997578592, 7)
Enabling hypothetical index: (5, 997578592, 8)
Enabling hypothetical index: (5, 1061578820, 4)
Enabling hypothetical index: (5, 1061578820, 5)
Dropped hypothetical index: dbo.part.IX_part_p_size_p_type
Dropped hypothetical index: dbo.part.IX_part_p_type_p_size
Dropped hypothetical index: dbo.partsupp.IX_partsupp_ps_suppkey_ps_partkey
Dropped hypothetical index: dbo.partsupp.IX_partsupp_ps_partkey_ps_suppkey
(16.6491, 12.6178, None)


#### Materialse the hypothetical configuration and measure actual query cost

In [99]:
connection = start_connection() 

config_indexes = indices[:4]


# get list of all indexes currently in the database
existing_indexes = get_nonclustered_indexes(connection)
existing_indexes = [x[2] for x in existing_indexes]
print(f"\nExisting indexes: {existing_indexes}\n")

# create all candidadte indexes for the test query
index_creation_time = 0
index_size = 0
for index in config_indexes:
    if index.index_id not in existing_indexes:
        index_creation_time += create_nonclustered_index_object(connection, index, verbose=True)
        index_size += index.size
    else:
        print(f"Index {index.index_id} already exists, skipping")

print(f"\nTotal index creation time: {index_creation_time:.2f} seconds, Total Size: {index_size} Mb")


# execute the query and get the execution time
print(execute_query(query.query_string, connection, cost_type='elapsed_time')[0])
print(estimate_query_cost(connection, query.query_string)[0])


close_connection(connection)


Existing indexes: ['IX_part_p_size_p_type', 'IX_part_p_type_p_size', 'IX_partsupp_ps_partkey_ps_suppkey', 'IX_partsupp_ps_suppkey_ps_partkey']

Index IX_part_p_size_p_type already exists, skipping
Index IX_part_p_type_p_size already exists, skipping
Index IX_partsupp_ps_suppkey_ps_partkey already exists, skipping
Index IX_partsupp_ps_partkey_ps_suppkey already exists, skipping

Total index creation time: 0.00 seconds, Total Size: 0 Mb
0.215
16.3005


In [78]:
print(query.query_string)


select
	s_acctbal,
	s_name,
	n_name,
	p_partkey,
	p_mfgr,
	s_address,
	s_phone,
	s_comment
from
	part,
	supplier,
	partsupp,
	nation,
	region
where
	p_partkey = ps_partkey
	and s_suppkey = ps_suppkey
	and p_size = 26
	and p_type like '%NICKEL'
	and s_nationkey = n_nationkey
	and n_regionkey = r_regionkey
	and r_name = 'EUROPE'
	and ps_supplycost = (
		select
			min(ps_supplycost)
		from
			partsupp,
			supplier,
			nation,
			region
		where
			p_partkey = ps_partkey
			and s_suppkey = ps_suppkey
			and s_nationkey = n_nationkey
			and n_regionkey = r_regionkey
			and r_name = 'EUROPE'
	)
order by
	s_acctbal desc,
	n_name,
	s_name,
	p_partkey

;
