#### Non-clustered Index configuration generator

Given a query, we will use it's properties (predicates, payload) to generate a list of candidate index configuration that could benefit the execution of that query.

In [27]:
import logging
import datetime
import os
import subprocess
import uuid

import pyodbc
import sys
import random
import pandas as pd
import time
import os
from tqdm import tqdm
import logging
import re
import json
import xml.etree.ElementTree as ET
import itertools

%load_ext autoreload
%autoreload 2

import IPython
notebook_path = IPython.get_ipython().starting_dir
target_subdirectory_path = os.path.abspath(os.path.join(os.path.dirname(notebook_path), 'database'))
sys.path.append(target_subdirectory_path)
from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
# read workload queries from JSON file
def read_workload(workload_filepath):
    workload = []
    with open(workload_filepath) as f:
        line = f.readline()
        # read the queries from each line
        while line:
            workload.append(json.loads(line))
            line = f.readline()

    return workload

# Base directory containing the generated queries
workload_filepath = '../datagen/TPCH_workloads/TPCH_static_100_workload.json'

# Read the workload queries from file
workload = read_workload(workload_filepath)
print(len(workload))

2100


#### To generate candidate index configurations that may benefit a given query, can do the following:

* Look at each table in that query (If the table is too small, then full table scan is cheap so don't need to index. Also, if a table has high "selectivity" and also contains INCLUDE columns, then most likely a large proportion of it's rows will be returned, so again full table scan will be cheap so don't need to index)
* For each of these tables, look at the corresponding predicate columns (these are usually columns under the WHERE clause)    
* Identify the INCLUDE columns, which are columns that are in the payload (payload columns are usually under the SELECT clause) but are not predicate columns, i.e. columns which are needed in the query result but are not used for filtering
* Then generate multicolumn indexes without include columns by enumerating all permutations of the predicate columns, ranging from single-column permutations up to 6-column permutations (indexes on more than 6 columns becomes impractical) 
* Similarly, we generate multicolumn indexes by considering columns that are only in the payload but not in any predicate. Here, we don't need to consider all different column combinations, we can just make a single index for all payload columns for a given table in whatever order, this will mainly just serve as a covering index 
* Finally, we create indexes on tables with both predicate and payload columns. Here we consider indexes on all permutations of the predicate columns as index columns along with the include columns.
* For each index, we also estimate it's value.

In [36]:
class Index:
    def __init__(self, table_name, index_id, index_columns, include_columns=(), value=None):
        self.table_name = table_name
        self.index_id = index_id
        self.index_columns = index_columns
        self.include_columns = include_columns
        self.value = value

    def __str__(self):
        return f"Index({self.table_name}, {self.index_id}, {self.index_columns}, {self.include_columns}, {self.value})"

In [39]:
# constants
SMALL_TABLE_IGNORE = 10000
TABLE_MIN_SELECTIVITY = 0.2

connection = start_connection() 
tables = get_all_tables(connection)
all_columns = get_all_columns(connection)

# get all tables in db
print(f"Tables:")
for key in tables:
    print(tables[key])

print(f"\nAll columns: {all_columns}\n")    

# pick a query from the workload, get it's predicates and payload
i = 1
query = workload[i]
# convert to proper query object
query = Query(connection, query['template_id'], query['query_string'], query['payload'], query['predicates'], query['order_bys'])

query_template_id = query.template_id
query_predicates = query.predicates
query_payload = query.payload
print()
print(f"Query: {query.query_string}")
print()
print(f"Payload: {query_payload}")
print()

indices = []

# indexes on predicate columns only
for table_name, table_predicates in query_predicates.items():
    table = tables[table_name]
    print(f"\nTable --> {table_name}, Predicate Columns --> {set(table_predicates)}, table row count --> {table.row_count}")
    
    # identify include columns
    include_columns = []
    if table_name in query_payload:
        include_columns = list(set(query_payload[table_name]) - set(table_predicates))

    print(f"Include columns: {include_columns}")
    print(f"Query selectivity: {query.selectivity[table_name]}")


    # check if conditions for cheap full table scan are met
    if table.row_count < SMALL_TABLE_IGNORE or ((query.selectivity[table_name] > TABLE_MIN_SELECTIVITY) and (len(include_columns)>0)):
        print(f"Full table scan for table: {table_name} is cheap, skipping")
        continue

    # generate all possible permutations of predicate columns, from single column up to 6-column indices
    table_predicates = list(table_predicates.keys())[0:6]
    col_permutations = []
    for num_columns in range(1, min(6, len(table_predicates)+1)):
        col_permutations = list(itertools.permutations(table_predicates, num_columns)) 
    
    print(f"Column permutations: \n{col_permutations}")

    # assign an id and value to each index/column permutation
    for cp in col_permutations:
        index_id = get_index_id(cp, table_name)
        print(f"index_id: {index_id}, index columns: {cp}")
        # assign value...
        # create index object
        index = Index(table_name, index_id, cp)
        indices.append(index)



# indexes on columns that are in the payload but not in the predicates
for table_name, table_payload in query_payload.items():
    table = tables[table_name]
    print(f"\nTable --> {table_name}, Payload Columns --> {set(table_predicates)}, table row count --> {table.row_count}")
    
    # skip if any of the payload columns for this table are in the predicates
    if table_name in query_predicates:
        print(f"Payload columns are in the predicates, skipping")
        continue

    # check if conditions for cheap full table scan are met
    if table.row_count < SMALL_TABLE_IGNORE:
        print(f"Full table scan for table: {table_name} is cheap, skipping")
        continue   

    # don't need to consider permutations here, just create an index with all payload columns in given order
    index_id = get_index_id(table_payload, table_name)
    print(f"index_id: {index_id}, index columns: {table_payload}")
    # assign value... (will assign less value to these indices as they are less useful compared to predicate indices)
    indices.append(Index(table_name, index_id, table_payload))

# indexes with include columns
for table_name, table_predicates in query_predicates.items():
    table = tables[table_name]
    print(f"\nTable --> {table_name}, Predicate Columns --> {set(table_predicates)}, table row count --> {table.row_count}")
    
    # check if conditions for cheap full table scan are met
    if table.row_count < SMALL_TABLE_IGNORE:
        print(f"Full table scan for table: {table_name} is cheap, skipping")
        continue  

    # identify include columns
    include_columns = []
    if table_name in query_payload:
        include_columns = sorted(list(set(query_payload[table_name]) - set(table_predicates)))

    if len(include_columns)>0:    
        print(f"Include columns: {include_columns}")

        # generate all possible permutations of predicate columns, from single column up to 6-column indices
        table_predicates = list(table_predicates.keys())[0:6]
        col_permutations = list(itertools.permutations(table_predicates, len(table_predicates))) 
        
        print(f"Column permutations: \n{col_permutations}")

        # assign an id and value to each index/column permutation
        for cp in col_permutations:
            index_id = get_index_id(cp, table_name)
            print(f"index_id: {index_id}, index columns: {cp}, include columns: {include_columns}")
            # assign value...
            # create index object
            index = Index(table_name, index_id, cp, tuple(include_columns))
            indices.append(index)
    

close_connection(connection)

Tables:
Table: customer, Row Count: 150000, PK Columns: ['c_custkey']
Table: orders, Row Count: 1500000, PK Columns: ['o_orderkey']
Table: lineitem, Row Count: 6001215, PK Columns: ['l_linenumber', 'l_orderkey']
Table: part, Row Count: 200000, PK Columns: ['p_partkey']
Table: supplier, Row Count: 10000, PK Columns: ['s_suppkey']
Table: partsupp, Row Count: 800000, PK Columns: ['ps_partkey', 'ps_suppkey']
Table: nation, Row Count: 25, PK Columns: ['n_nationkey']
Table: region, Row Count: 5, PK Columns: ['r_regionkey']

All columns: (defaultdict(<class 'list'>, {'customer': ['c_acctbal', 'c_address', 'c_comment', 'c_custkey', 'c_mktsegment', 'c_name', 'c_nationkey', 'c_phone'], 'orders': ['o_clerk', 'o_comment', 'o_custkey', 'o_orderdate', 'o_orderkey', 'o_orderpriority', 'o_orderstatus', 'o_shippriority', 'o_totalprice'], 'lineitem': ['l_comment', 'l_commitdate', 'l_discount', 'l_extendedprice', 'l_linenumber', 'l_linestatus', 'l_orderkey', 'l_partkey', 'l_quantity', 'l_receiptdate', 'l

In [40]:
for index in indices:
    print(index)

Index(part, IX_part_p_size_p_type, ('p_size', 'p_type'), (), None)
Index(part, IX_part_p_type_p_size, ('p_type', 'p_size'), (), None)
Index(partsupp, IX_partsupp_ps_suppkey_ps_partkey, ('ps_suppkey', 'ps_partkey'), (), None)
Index(partsupp, IX_partsupp_ps_partkey_ps_suppkey, ('ps_partkey', 'ps_suppkey'), (), None)
Index(supplier, IX_supplier_s_nationkey, ('s_nationkey',), (), None)
Index(part, IX_part_p_size_p_type, ('p_size', 'p_type'), ('p_mfgr', 'p_partkey'), None)
Index(part, IX_part_p_type_p_size, ('p_type', 'p_size'), ('p_mfgr', 'p_partkey'), None)
Index(partsupp, IX_partsupp_ps_suppkey_ps_partkey, ('ps_suppkey', 'ps_partkey'), ('ps_supplycost',), None)
Index(partsupp, IX_partsupp_ps_partkey_ps_suppkey, ('ps_partkey', 'ps_suppkey'), ('ps_supplycost',), None)
Index(supplier, IX_supplier_s_nationkey, ('s_nationkey',), ('s_acctbal', 's_address', 's_comment', 's_name', 's_phone'), None)
