#### WFIT Algorithm Implementation (Schnaitter 2011)

In [3]:
%load_ext autoreload
%autoreload 2

import os, sys
import IPython
notebook_path = IPython.get_ipython().starting_dir
target_subdirectory_path = os.path.abspath(os.path.join(os.path.dirname(notebook_path), 'PostgreSQL'))
sys.path.append(target_subdirectory_path)

from pg_utils import *
from ssb_qgen_class import *

from collections import defaultdict
from functools import lru_cache
import time
import random
from more_itertools import powerset
from itertools import chain
from tqdm import tqdm
import concurrent.futures


#### Index Benefit Graph (IBG)

In [18]:
class Node:
    """One vertex of the index benefit graph.

    Holds the configuration it represents, its links to neighbouring nodes,
    and placeholders for the evaluation results filled in later.
    """

    def __init__(self, id, indexes):
        # identity of the node and the configuration it stands for
        self.id, self.indexes = id, indexes
        # graph links; every node gets its own two fresh lists
        self.children, self.parents = [], []
        # evaluation state: nothing is known until the node is marked built
        self.built = False
        self.cost = self.used = None


# class for creating and storing the IBG
class IBG:
    # Class-level cache
    #_class_cache = {}

    def __init__(self, query_object, C):
        self.q = query_object
        self.C = C
        print(f"Number of candidate indexes: {len(self.C)}")
        #print(f"Candidate indexes: {self.C}")
        
        # map index_id to integer
        self.idx2id = {index.index_id:i for i, index in enumerate(self.C)}
        self.idx2index = {index.index_id:index for index in self.C}
        print(f"Index id to integer mapping: {self.idx2id}")
        
        # create a hash table for keeping track of all created nodes
        self.nodes = {}
        # create a root node
        self.root = Node(self.get_configuration_id(self.C), self.C)
        self.nodes[self.root.id] = self.root
        print(f"Created root node with id: {self.root.id}")
        
        self.total_whatif_calls = 0
        self.total_whatif_time = 0
        self.node_count = 0

        # start the IBG construction
        print("Constructing IBG...")
        self.construct_ibg(self.root)
        print(f"Number of nodes in IBG: {len(self.nodes)}, Total number of what-if calls: {self.total_whatif_calls}, Time spent on what-if calls: {self.total_whatif_time}")
        # compute all pair degree of interaction
        print(f"Computing all pair degree of interaction...")
        start_time = time.time()
        #self.doi = self.compute_all_pair_doi()
        #self.doi = self.compute_all_pair_doi_parallel()
        #self.doi = self.compute_all_pair_doi_simple()
        self.doi = self.compute_all_pair_doi_naive()
        end_time = time.time()
        print(f"Time spent on computing all pair degree of interaction: {end_time - start_time}")

    # assign unique string id to a configuration
    def get_configuration_id(self, indexes):
        # get sorted list of integer ids
        ids = sorted([self.idx2id[idx.index_id] for idx in indexes])
        return "_".join([str(i) for i in ids])
    
    def _get_cost_used(self, indexes):
        # Convert indexes to a tuple to make it hashable
        #indexes_tuple = tuple(sorted(indexes, key=lambda x: x.index_id))
        # Check if the result is already in the class-level cache
        #if indexes_tuple in self._class_cache:
        #    return self._class_cache[indexes_tuple]
        
        start_time = time.time()
        conn = create_connection()
        # create hypothetical indexes
        hypo_indexes = bulk_create_hypothetical_indexes(conn, indexes)
        # map oid to index object
        oid2index = {}
        for i in range(len(hypo_indexes)):
            oid2index[hypo_indexes[i]] = indexes[i]
        # get cost and used indexes
        cost, indexes_used = get_query_cost_estimate_hypo_indexes(conn, self.q.query_string, show_plan=False)
        # map used index oids to index objects
        used = [oid2index[oid] for oid, scan_type, scan_cost in indexes_used]
        # drop hypothetical indexes
        bulk_drop_hypothetical_indexes(conn)
        close_connection(conn)
        end_time = time.time()

        # Store the result in the class-level cache
        #self._class_cache[indexes_tuple] = (cost, used)
        self.total_whatif_calls += 1
        self.total_whatif_time += end_time - start_time

        return cost, used

    # Ensure the indexes parameter is hashable
    def _cached_get_cost_used(self, indexes):
        return self._get_cost_used(tuple(indexes))

    # recursive IBG construction algorithm
    def construct_ibg(self, Y):
        if Y.built:
            return 
        
        # obtain query optimizers cost and used indexes
        #print(f"Creating node for configuration: {[idx.index_id for idx in Y.indexes]}")
        self.node_count += 1
        print(f"Creating node # {self.node_count}", end="\r")

        cost, used = self._cached_get_cost_used(Y.indexes)
        Y.cost = cost
        Y.used = used
        Y.built = True
        
        #print(f"Cost: {cost}, Used indexes: {[idx.index_id for idx in used]}")
        #for idx in used:
        #    print(f"{idx}")

        # create children
        for a in Y.used:
            # create a new configuration with index a removed from Y
            X_indexes = [index for index in Y.indexes if index != a]
            X_id = self.get_configuration_id(X_indexes)
            
            # if X is not in the hash table, create a new node and recursively build it
            if X_id not in self.nodes:
                X = Node(X_id, X_indexes)
                X.parents.append(Y)
                self.nodes[X_id] = X
                Y.children.append(X)
                self.construct_ibg(X)

            else:
                X = self.nodes[X_id]
                Y.children.append(X)
                X.parents.append(Y)


    # use IBG to obtain estimated cost and used indexes for arbitrary subset of C
    def get_cost_used(self, X):
        # get id of the configuration
        id = self.get_configuration_id(X)
        # check if the configuration is in the IBG
        if id in self.nodes:
            cost, used = self.nodes[id].cost, self.nodes[id].used
        
        # if not in the IBG, traverse the IBG to find a covering node
        else:
            Y = self.find_covering_node(X)              
            cost, used = Y.cost, Y.used

        return cost, used    


    # traverses the IBG to find a node that removes indexes not in X (i.e. a covering node for X)
    def find_covering_node(self, X):
        X_indexes = set([index.index_id for index in X])
        Y = self.root
        Y_indexes = set([index.index_id for index in Y.indexes])
        # traverse IBG to find covering node
        while (len(Y_indexes - X_indexes) != 0) or (len(Y.children) > 0):               
            # traverse down to the child node that removes an index not in X
            child_found = False
            for child in Y.children:
                child_indexes = set([index.index_id for index in child.indexes])
                child_indexes_removed = Y_indexes - child_indexes
                child_indexes_removed_not_in_X = child_indexes_removed - X_indexes
        
                # check if child removes an index not in X
                if len(child_indexes_removed_not_in_X) > 0:
                    Y = child
                    Y_indexes = child_indexes
                    child_found = True
                    break

            # if no children remove indexes not in X    
            if not child_found:
                break    
    
        return Y        

    # compute benefit of an index for a given configuration 
    # input X is a list of index objects and 'a' is a single index object
    # X must not contain 'a'
    def compute_benefit(self, a, X):
        if a in X:
            # zero benefit if 'a' is already in X
            #raise ValueError("Index 'a' is already in X")
            return 0
        
        # get cost  for X
        cost_X = self.get_cost_used(X)[0]
        # create a new configuration with index a added to X
        X_a = X + [a]
        # get cost for X + {a}
        cost_X_a = self.get_cost_used(X_a)[0]
        # compute benefit
        benefit = cost_X - cost_X_a
        return benefit 


    # compute maximum benefit of adding an index to any possible configuration
    def compute_max_benefit(self, a):
        max_benefit = float('-inf')
        for id, node in self.nodes.items():
            #print(f"Computing benefit for node: {[index.index_id for index in node.indexes]}")
            benefit = self.compute_benefit(a, node.indexes)
            if benefit > max_benefit:
                max_benefit = benefit

        return max_benefit
    
    # compute the degree of interaction between two indexes a,b in configuration X 
    def compute_doi_configuration(self, a, b, X=[], normalize=True):
        # X must not contain a or b
        if a in X or b in X:
            raise ValueError("a or b is already in X")

        doi = abs(self.compute_benefit(a, X) - self.compute_benefit(a, X + [b]))
        if normalize:
            doi /= self.get_cost_used(X + [a,b])[0]   
        return doi
   
    
    # Cache the results of find_covering_node and get_cost_used to avoid redundant calculations
    @lru_cache(maxsize=None)
    def cached_find_covering_node(self, indexes):
        return self.find_covering_node(tuple(indexes))

    @lru_cache(maxsize=None)
    def cached_get_cost_used(self, indexes):
        return self.get_cost_used(tuple(indexes))



    # computes the degree of interaction between all pairs of indexes (a,b) in candidate set C
    # Note: doi is symmetric, i.e. doi(a,b) = doi(b,a)

    # simple version of compute_all_pair_doi, without parallelization
    def compute_all_pair_doi_simple(self):
        # hash table for storing doi values
        doi = {}
        # intialize doi values to zero
        for i in range(len(self.C)):
            for j in range(i+1, len(self.C)):
                d = self.compute_doi_configuration(self.C[i], self.C[j])
                doi[(self.C[i].index_id, self.C[j].index_id)] = d
                doi[(self.C[j].index_id, self.C[i].index_id)] = d

        return doi

    # Naive version of compute_all_pair_doi, with random sampling of configurations
    def compute_all_pair_doi_naive(self, num_samples=100):
        doi = {}
        
        for i in range(len(self.C)):
            for j in range(i + 1, len(self.C)):
                doi[(self.C[i].index_id, self.C[j].index_id)] = 0
                doi[(self.C[j].index_id, self.C[i].index_id)] = 0
        
        # sample random configurations: X subset C (must include empty set configuration)
        for i in tqdm(range(num_samples), desc="Sampling configurations"):
            if i == 0:
                X = []
            else:
                X = random.sample(self.C, random.randint(1, len(self.C)))

            # compute doi for all pairs (a, b) in U\X 
            for i in range(len(self.C)):
                for j in range(i+1, len(self.C)):
                    a = self.C[i]
                    b = self.C[j]
                    if a not in X and b not in X:
                        d = self.compute_doi_configuration(a, b, X)
                        doi[(a.index_id, b.index_id)] = max(doi[(a.index_id, b.index_id)], d)
                        doi[(b.index_id, a.index_id)] = max(doi[(b.index_id, a.index_id)], d)        
        
        return doi    

    # original version of compute_all_pair_doi, with optional max_iters parameter
    def compute_all_pair_doi(self, max_iters=None):
        # hash table for storing doi values
        doi = {}
        # intialize doi values to zero
        for i in range(len(self.C)):
            for j in range(i+1, len(self.C)):
                doi[(self.C[i].index_id, self.C[j].index_id)] = 0
                doi[(self.C[j].index_id, self.C[i].index_id)] = 0

        S_idxs = set([index.index_id for index in self.C])

        # iterate over each IBG node
        iter_count = 0
        for Y in tqdm(self.nodes.values(), desc="Processing nodes"):
            if max_iters is not None and iter_count >= max_iters:
                break
        
            iter_count += 1
            
            # remove Y.used from S
            Y_idxs = set([index.index_id for index in Y.indexes])
            used_Y = Y.used
            Y_used_idxs = set([index.index_id for index in used_Y])
            S_Y = list(S_idxs - Y_used_idxs)
            # iterate over all pairs of indexes in S_Y
            for i in range(len(S_Y)):
                for j in range(i+1, len(S_Y)):
                    a_idx = S_Y[i]
                    b_idx = S_Y[j]
                     
                    # find Ya covering node in IBG
                    Ya = (Y_idxs - {a_idx, b_idx}) | {a_idx}
                    Ya = [self.idx2index[idx] for idx in Ya]
                    Ya = self.cached_find_covering_node(tuple(Ya))
                    # find Yab covering node in IBG
                    Yab = (Y_idxs - {a_idx, b_idx}) | {a_idx, b_idx}
                    Yab = [self.idx2index[idx] for idx in Yab]
                    Yab = self.cached_find_covering_node(tuple(Yab))

                    #used_Y = self.cached_get_cost_used(tuple(Y.indexes))[1]
                    #used_Ya = self.cached_get_cost_used(tuple(Ya))[1]
                    #used_Yab = self.cached_get_cost_used(tuple(Yab))[1]
                    used_Ya = Ya.used
                    used_Yab = Yab.used

                    Uab = set([index.index_id for index in used_Y]) | set([index.index_id for index in used_Ya]) | set([index.index_id for index in used_Yab]) 
                    # find Yb_minus covering node in IBG 
                    Yb_minus = list((Uab - {a_idx, b_idx}) | {b_idx})
                    Yb_minus = [self.idx2index[idx] for idx in Yb_minus]
                    Yb_minus = self.cached_find_covering_node(tuple(Yb_minus))
                    # find Yb_plus covering node in IBG
                    Yb_plus = list((Y_idxs - {a_idx, b_idx}) | {b_idx})
                    Yb_plus = [self.idx2index[idx] for idx in Yb_plus]
                    Yb_plus = self.cached_find_covering_node(tuple(Yb_plus))

                    # generate quadruples
                    quadruples = [(Y.indexes, Ya.indexes, Yb_minus.indexes, Yab.indexes), (Y.indexes, Ya.indexes, Yb_plus.indexes, Yab.indexes)]

                    # compute doi using the quadruples
                    for Y_indexes, Ya_indexes, Yb_indexes, Yab_indexes in quadruples:
                        cost_Y = self.cached_get_cost_used(tuple(Y_indexes))[0]
                        cost_Ya = self.cached_get_cost_used(tuple(Ya_indexes))[0]
                        cost_Yb = self.cached_get_cost_used(tuple(Yb_indexes))[0]
                        cost_Yab = self.cached_get_cost_used(tuple(Yab_indexes))[0]
                        # can ignore the normalization terms in denominator to get an absolute measure of doi
                        d = abs(cost_Y - cost_Ya - cost_Yb + cost_Yab) / cost_Yab
                        # save doi value for the pair
                        doi[(a_idx,b_idx)] = max(doi[(a_idx,b_idx)], d)
                        # save doi value for the symmetric pair
                        doi[(b_idx,a_idx)] = max(doi[(b_idx,a_idx)], d)     
                            
        return doi


    # parallelized version of compute_all_pair_doi
    def compute_all_pair_doi_parallel(self, chunk_size=32, batch_size=16, max_iters=None):
        doi = {}
        
        for i in range(len(self.C)):
            for j in range(i + 1, len(self.C)):
                doi[(self.C[i].index_id, self.C[j].index_id)] = 0
                doi[(self.C[j].index_id, self.C[i].index_id)] = 0
        
        S_idxs = set([index.index_id for index in self.C])
        
        nodes_list = list(self.nodes.values())
        chunks = [nodes_list[i:i + chunk_size] for i in range(0, len(nodes_list), chunk_size)]
        
        args = [(chunk, self.C, self.idx2index, S_idxs, self.cached_find_covering_node, self.cached_get_cost_used, max_iters) for chunk in chunks]
        
        with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
            results = list(tqdm(executor.map(process_node_chunk, args), total=len(chunks), desc="Processing nodes in parallel"))
        
        for result in results:
            for key, value in result.items():
                doi[key] = max(doi.get(key, 0), value)
        
        return doi
    
    
    # get precomputed degree of interaction between a pair of indexes
    def get_doi_pair(self, a, b):
            return self.doi[(a.index_id, b.index_id)]


    # function for printing the IBG, using BFS level order traversal
    def print_ibg(self):
        q = [self.root]
        # traverse level by level, print all node ids in a level in a single line before moving to the next level
        while len(q) > 0:
            next_q = []
            for node in q:
                print(f"{node.id} -> ", end="")
                for child in node.children:
                    next_q.append(child)
            print()
            q = next_q  


def process_node_chunk(args):
    """Compute pairwise degree-of-interaction maxima for one chunk of IBG nodes.

    ``args`` is a single 7-tuple (so the function can be used with ``map``):
    ``(nodes_chunk, C, idx2index, S_idxs, cached_find_covering_node,
    cached_get_cost_used, max_iters)``.

    nodes_chunk: iterable of nodes exposing ``indexes`` and ``used``.
    C: accepted for symmetry with the caller; not read here.
    idx2index: maps an index id to its index object.
    S_idxs: set of all candidate index ids.
    cached_find_covering_node: callable taking a tuple of index objects and
        returning a node.
    cached_get_cost_used: callable taking a tuple of index objects and
        returning a sequence whose first item is the cost.
    max_iters: optional cap on the number of nodes processed from this chunk.

    Returns a dict keyed by ``(index_id, index_id)``; only pairs actually
    evaluated appear, each under both orderings.
    """
    (nodes_chunk, C, idx2index, S_idxs,
     cached_find_covering_node, cached_get_cost_used, max_iters) = args

    def id_set(indexes):
        return set([index.index_id for index in indexes])

    def covering(index_ids):
        # translate ids back to index objects, then look up the covering node
        return cached_find_covering_node(tuple([idx2index[idx] for idx in index_ids]))

    def cost_of(indexes):
        return cached_get_cost_used(tuple(indexes))[0]

    doi_chunk = {}
    for visited, Y in enumerate(nodes_chunk):
        if max_iters is not None and visited >= max_iters:
            break

        Y_idxs = id_set(Y.indexes)
        used_Y = Y.used
        # candidate ids that the plan of Y does not use
        S_Y = list(S_idxs - id_set(used_Y))

        for pos, a_idx in enumerate(S_Y):
            for b_idx in S_Y[pos + 1:]:
                both = {a_idx, b_idx}

                Ya = covering((Y_idxs - both) | {a_idx})
                Yab = covering((Y_idxs - both) | {a_idx, b_idx})

                # ids used by any of the three plans seen so far
                Uab = id_set(used_Y) | id_set(Ya.used) | id_set(Yab.used)

                Yb_minus = covering(list((Uab - both) | {b_idx}))
                Yb_plus = covering(list((Y_idxs - both) | {b_idx}))

                # two quadruples that differ only in the node standing in for "b alone"
                for Yb in (Yb_minus, Yb_plus):
                    cost_Y = cost_of(Y.indexes)
                    cost_Ya = cost_of(Ya.indexes)
                    cost_Yb = cost_of(Yb.indexes)
                    cost_Yab = cost_of(Yab.indexes)

                    d = abs(cost_Y - cost_Ya - cost_Yb + cost_Yab) / cost_Yab
                    for pair in ((a_idx, b_idx), (b_idx, a_idx)):
                        doi_chunk[pair] = max(doi_chunk.get(pair, 0), d)

    return doi_chunk

In [5]:
# Instantiate the SSB query generator; later cells call qg.generate_query(<template id>).
# QGEN is presumably provided by the star import from ssb_qgen_class -- confirm.
qg = QGEN()

In [16]:
# Smoke test for the IBG on SSB query template 1.

# IBG construction samples random configurations and this cell samples a random
# subset, so fix the seed to make reruns comparable
random.seed(1234)

query = qg.generate_query(1)
print(query)

# extract the candidates from the same query instance that is printed and costed,
# rather than from a second, separately generated query
C = extract_query_indexes(query, include_cols=False)

ibg = IBG(query, C)

ibg.print_ibg()

# compare the IBG-derived estimate against a direct what-if call on a random subset
# (the sample size is capped so the cell also runs with fewer than 8 candidates)
X = random.sample(ibg.C, min(8, len(ibg.C)))
cost, used = ibg.get_cost_used(X)
print(f"IBG     --> Cost: {cost}, Used indexes: {[idx.index_id for idx in used]}")

cost, used = ibg._cached_get_cost_used(X)
print(f"What-if --> Cost: {cost}, Used indexes: {[idx.index_id for idx in used]}")

# pick two indexes and a configuration
# NOTE: the hard-coded positions below require at least nine candidate indexes
a = ibg.C[0]
b = ibg.C[4]
X = [ibg.C[1], ibg.C[2], ibg.C[5], ibg.C[6], ibg.C[8]]

# compute maximum benefit of adding index 'a'
max_benefit = ibg.compute_max_benefit(a)
print(f"\nMaximum benefit of adding index {a.index_id}: {max_benefit}")

# compute degree of interaction between indexes 'a' and 'b' in configuration X
doi = ibg.compute_doi_configuration(a, b, X)
print(f"\nDOI between indexes {a.index_id} and {b.index_id} : {doi}")
print(f"in configuration {[idx.index_id for idx in X]}")

# compute configuration independent degree of interaction between indexes 'a' and 'b'
doi = ibg.get_doi_pair(a, b)
print(f"\nDOI between indexes {a.index_id} and {b.index_id} : {doi}")

template id: 1, query: 
                SELECT SUM(lo_extendedprice * lo_discount) AS revenue
                FROM lineorder, dwdate
                WHERE lo_orderdate = d_datekey
                AND d_year = 1993
                AND lo_discount BETWEEN 6 AND 8 
                AND lo_quantity < 14;
            , payload: {'lineorder': ['lo_extendedprice', 'lo_discount']}, predicates: {'lineorder': ['lo_orderdate', 'lo_discount', 'lo_quantity'], 'dwdate': ['d_datekey', 'd_year']}, order by: {}, group by: {}
Number of candidate indexes: 19
Index id to integer mapping: {'IX_lineorder_lo_orderdate': 0, 'IX_lineorder_lo_discount': 1, 'IX_lineorder_lo_quantity': 2, 'IX_lineorder_lo_orderdate_lo_discount': 3, 'IX_lineorder_lo_orderdate_lo_quantity': 4, 'IX_lineorder_lo_discount_lo_orderdate': 5, 'IX_lineorder_lo_discount_lo_quantity': 6, 'IX_lineorder_lo_quantity_lo_orderdate': 7, 'IX_lineorder_lo_quantity_lo_discount': 8, 'IX_lineorder_lo_orderdate_lo_discount_lo_quantity': 9, 'IX_lineorder

Sampling configurations: 100%|██████████| 100/100 [00:00<00:00, 990.77it/s]

Time spent on computing all pair degree of interaction: 0.10253143310546875
0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15_16_17_18 -> 
0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15_16_17 -> 
0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15_17 -> 
0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -> 
IBG     --> Cost: 1427683.8, Used indexes: ['IX_dwdate_d_year_d_datekey']
What-if --> Cost: 1427683.8, Used indexes: ['IX_dwdate_d_year_d_datekey']

Maximum benefit of adding index IX_lineorder_lo_orderdate: 0

DOI between indexes IX_lineorder_lo_orderdate and IX_lineorder_lo_orderdate_lo_quantity : 0.0
in configuration ['IX_lineorder_lo_discount', 'IX_lineorder_lo_quantity', 'IX_lineorder_lo_discount_lo_orderdate', 'IX_lineorder_lo_discount_lo_quantity', 'IX_lineorder_lo_quantity_lo_discount']

DOI between indexes IX_lineorder_lo_orderdate and IX_lineorder_lo_orderdate_lo_quantity : 0





In [7]:
#for key, value in ibg.doi.items():
#    print(f"doi({key[0]},   {key[1]}) = {value}")

#### WFIT class

In [8]:
class WFIT:

    def __init__(self, S_0=[], max_key_columns=None, max_U=150, idxCnt=50, stateCnt=200, histSize=100, rand_cnt=1):
        # initial set of materialzed indexes
        self.S_0 = S_0
        # maximum number of key columns in an index
        self.max_key_columns = max_key_columns
        # maximum number of candidate indexes for IBG 
        self.max_U = max_U
        # parameter for maximum number of candidate indexes tracked 
        self.idxCnt = idxCnt
        # parameter for maximum number of MTS states/configurations
        self.stateCnt = stateCnt
        # parameter for maximum number of historical index statistics kept
        self.histSize = histSize
        # parameter for number of randomized clustering iterations
        self.rand_cnt = rand_cnt
        # growing list of candidate indexes (initially contains S_0)
        self.U = {index.index_id:index for index in S_0}
        # index benefit and interaction statistics
        self.idxStats = defaultdict(list)
        self.intStats = defaultdict(list)
        # list of currently monitored indexes
        self.C = {index.index_id:index for index in S_0} 
        # list of currently materialized indexes
        self.M = {index.index_id:index for index in S_0}  
        # initialize stable partitions (each partition is a singleton set of indexes from S_0)
        self.stable_partitions = [[index] for index in S_0]
        self.n_pos = 0

        print(f"##################################################################")
        # initialize work function instance for each stable partition
        self.W = self.initilize_WFA(self.stable_partitions)
        # initialize current recommendations for each stable partition
        self.current_recommendations = {i:indexes for i, indexes in enumerate(self.stable_partitions)}


        print(f"Initial set of materialized indexes: {[index.index_id for index in S_0]}")
        print(f"Stable partitions: {[[index.index_id for index in P] for P in self.stable_partitions]}")
        print(f"Initial work function instances: ")
        for i, wf in self.W.items():
            print(f"\tWFA Instance #{i}: {wf}")

        print(f"\nMaximum number of candidate indexes tracked: {idxCnt}")
        print(f"Maximum number of MTS states/configurations: {stateCnt}")
        print(f"Maximum number of historical index statistics kept: {histSize}")
        print(f"Number of randomized clustering iterations: {rand_cnt}")
        print(f"##################################################################\n")

        # set random seed
        random.seed(1234)


    # initialize a WFA instance for each stable partition
    def initilize_WFA(self, stable_partitions):
        print(f"Initializing WFA instances for {len(stable_partitions)} stable partitions...")
        W = {}
        for i, P in enumerate(stable_partitions):
            # initialize all MTS states, i.e. power set of indexes in the partition
            states = [tuple(sorted(state, key=lambda x: x.index_id)) for state in powerset(P)]
            # initialize work function instance for the partition
            W[i] = {tuple(X):self.compute_transition_cost(self.S_0, X) for X in states}    
            
        return W


    # update WFIT step for next query in workload (this is the MAIN INTERFACE for generating an index configuration recommendation)
    def process_WFIT(self, query_object, verbose=False):
        self.n_pos += 1
        
        # generate new partitions 
        if verbose: print(f"Generating new partitions for query #{self.n_pos}")
        start_time_1 = time.time()
        new_partitions, need_to_repartition, ibg = self.choose_candidates(self.n_pos, query_object, verbose)
        end_time_1 = time.time()

        # repartition if necessary
        start_time_2 = time.time()
        if need_to_repartition:
            if verbose: print(f"Repartitioning...")
            self.repartition(new_partitions, verbose)
        end_time_2 = time.time()
        
        # analyze the query
        if verbose: print(f"Analyzing query...")
        start_time_3 = time.time()
        self.analyze_query(query_object, ibg, verbose)
        end_time_3 = time.time()    

        if verbose: print(f"Currently materialized indexes: {[index.index_id for index in self.M.values()]}") 

        # remove stale indexes from U
        if verbose: print(f"Removing stale indexes from U...")
        self.remove_stale_indexes_U(verbose)

        # simple recommendation, just the used indexes in the IBG root node
        self.get_simple_recommendation_ibg(ibg)


        print(f"Total time taken for processing query #{self.n_pos}: {end_time_3 - start_time_1} seconds")
        print(f"(Partitioning: {end_time_1 - start_time_1} seconds, Repartitioning: {end_time_2 - start_time_2} seconds, Analyzing: {end_time_3 - start_time_3} seconds)")


    # Simple baseline recommendation: just the used indexes in the IBG root node, i.e. these are the indexes from 
    # the full set of candidate indexes which are used in the query plan
    def get_simple_recommendation_ibg(self, ibg):
        simple_recommendation = ibg.root.used  
        wfit_recommendation = [index.index_id for i in self.current_recommendations for index in self.current_recommendations[i]]
        print(f"*** WFIT recommendation: {sorted(wfit_recommendation)}")
        print(f"*** Simple recommendation: {sorted([index.index_id for index in simple_recommendation])}") 


    # check for stale indexes in U and remove them
    def remove_stale_indexes_U(self, verbose, min_overlapping_columns=0):
        # find out which indexes have loweest benefit statistics
        avg_benefit = {}
        for index_id in self.U:
            # compute average benefit of the index from all stats
            avg_benefit[index_id] = sum([stat[1] for stat in self.idxStats[index_id]]) / len(self.idxStats[index_id])

            
        # sort indexes by average benefit
        sorted_indexes = sorted(avg_benefit, key=avg_benefit.get, reverse=True)

        # mark all indexes with zero benefit and not in M and S_0 as stale
        stale_indexes = set()
        for index_id in sorted_indexes:
            if avg_benefit[index_id] == 0 and index_id not in self.M and index_id not in self.S_0:
                stale_indexes.add(index_id)

        # remove stale indexes from U
        print(f"Number of indexes in U: {len(self.U)}")
        num_removed = 0
        for index_id in stale_indexes:
            #if verbose: print(f"Removing stale index: {index_id}")
            del self.U[index_id]
            #if verbose: print(f"Number of indexes in U after removal: {len(self.U)}")
            num_removed += 1

        if verbose:
            #print(f"Average benefit of indexes:")
            #for index_id in sorted_indexes:
            #    print(f"\tIndex {index_id}: {avg_benefit[index_id]}, Stale: {index_id in stale_indexes}")
                
            print(f"Number of indexes removed: {num_removed}, Number of indexes remaining: {len(self.U)}")
            #print(f"Indexes in U: {self.U.keys()}")
                

    # repartition the stable partitions based on the new partitions
    def repartition(self, new_partitions, verbose):
        """Rebuild the per-partition WFA state for a new set of stable partitions.

        For every partition in `new_partitions` a fresh work function is created over
        all subsets of that partition. Each value is seeded with the sum of the previous
        work-function values at the overlap between the subset and each previous stable
        partition, plus a transition cost term. The starting recommendation of each new
        instance is the part of the partition that was recommended in the previous round.
        Finally `self.stable_partitions`, `self.W`, `self.current_recommendations` and
        `self.C` are replaced.

        Args:
            new_partitions: sequence of partitions, each a collection of index objects
                exposing an `index_id` attribute.
            verbose: when truthy, progress messages are printed.
        """
        def canonical(indexes):
            # work-function states are keyed by tuples ordered by index_id
            return tuple(sorted(indexes, key=lambda idx: idx.index_id))

        # every index recommended by any WFA instance in the previous round
        previously_recommended = set(chain(*self.current_recommendations.values()))
        monitored = set(self.C.values())
        initial_config = set(self.S_0)

        # re-initialize WFA instances and recommendations for each new partition
        if verbose: print("Reinitializing WFA instances...")
        rebuilt_W = {}
        rebuilt_recommendations = {}
        for part_no, partition in enumerate(new_partitions):
            # indexes of the initial configuration that enter monitoring through this partition;
            # set-wise this equals (S_0 & partition) - C
            initial_newcomers = initial_config & (set(partition) - monitored)

            work_function = {}
            for state in powerset(partition):
                state_key = canonical(state)

                # carry over the accumulated work from every previous WFA instance,
                # evaluated at the overlap of this state with the old partition
                carried_over = 0
                for old_no, old_wf in self.W.items():
                    overlap = set(state_key) & set(self.stable_partitions[old_no])
                    carried_over += old_wf[canonical(overlap)]

                # add the transition cost for indexes of the state that were not monitored before
                work_function[state_key] = carried_over + self.compute_transition_cost(
                    initial_newcomers, set(state_key) - monitored
                )

            rebuilt_W[part_no] = work_function
            # current state/recommended configuration of the new WFA instance
            rebuilt_recommendations[part_no] = list(set(partition) & previously_recommended)

        # swap in the new stable partitions, WFA instances and recommendations
        self.stable_partitions = new_partitions
        self.W = rebuilt_W
        self.current_recommendations = rebuilt_recommendations
        if verbose:
            print("Replaced stable partitions, WFA instances and recommendations with new ones")

        # the monitored set C is exactly the union of the new partitions
        self.C = {}
        for partition in self.stable_partitions:
            self.C.update((index.index_id, index) for index in partition)


    # update WFA instance on each stable partition and get index configuration recommendation
    def analyze_query(self, query_object, ibg, verbose):
        """Feed one query to every WFA instance and apply the resulting recommendations.

        For each stable partition the work function is advanced with `process_WFA`,
        the difference between the old and the new recommendation is applied to the
        materialized set `self.M` (keyed by `index_id`), and the new recommendation
        is stored in `self.current_recommendations`.

        Args:
            query_object: the query being analyzed (forwarded to `process_WFA`).
            ibg: index benefit graph for the query (forwarded to `process_WFA`).
            verbose: when truthy, a summary line is printed per WFA instance.
        """
        for i in self.W:
            if verbose: print(f"Updating WFA instance: {i}")
            previous = self.current_recommendations[i]
            updated_wf, recommended = self.process_WFA(query_object, self.W[i], previous, ibg, verbose)
            self.W[i] = updated_wf

            # work out what has to be created and what has to be dropped
            indexes_added = set(recommended) - set(previous)
            indexes_removed = set(previous) - set(recommended)
            if verbose: print(f"\tWFA Instance #{i}, Num States: {len(self.W[i])}, New Recommendation: {[index.index_id for index in recommended]} --> Indexes Added: {[index.index_id for index in indexes_added]}, Indexes Removed: {[index.index_id for index in indexes_removed]}")

            # keep the bookkeeping of materialized indexes in sync with the recommendation
            for created in indexes_added:
                self.M[created.index_id] = created
            for dropped in indexes_removed:
                del self.M[dropped.index_id]

            self.current_recommendations[i] = recommended

            # TODO: need to implement the following function in pg_utils
            # ... materialize_configuration(connection, indexes_added, indexes_removed)
            


    # update a WFA instance for the given query    
    def process_WFA(self, query_object, wf, S_current, ibg, verbose):
        """Advance one WFA instance by one query and pick its next recommendation.

        For every state Y the new work-function value is
        ``min over X of wf[X] + cost(query, X) + transition_cost(X, Y)``.
        Each state is then scored as ``wf_new[Y] + transition_cost(Y, S_current)`` and
        the predecessor X that achieved the minimum for the best-scoring Y is returned
        as the recommendation (ties keep the first state encountered).

        Args:
            query_object: the query being processed (not used directly here; the query
                cost comes from `ibg`).
            wf: dict mapping each state (tuple of index objects ordered by `index_id`)
                to its current work-function value.
            S_current: the configuration currently recommended by this instance.
            ibg: object whose `get_cost_used(list_of_indexes)` returns a sequence with
                the query cost at position 0.
            verbose: when truthy, the chosen state and its score are printed.

        Returns:
            (wf_new, best_state): the updated work function and the recommended state
            (`None` when `wf` is empty).
        """
        # Canonicalise every state once instead of re-sorting inside the nested loops.
        states = [tuple(sorted(state, key=lambda idx: idx.index_id)) for state in wf.keys()]

        # wf[X] + cost(query, X) depends only on X, so evaluate it once per state rather
        # than once per (X, Y) pair; this turns O(n^2) IBG lookups into O(n).
        base_values = [(X, wf[X] + ibg.get_cost_used(list(X))[0]) for X in states]

        wf_new = {}
        best_score = float('inf')
        best_state = None
        for Y in states:
            # new work function value for state Y and the predecessor that achieves it
            min_wf_value = float('inf')
            min_p = None
            for X, base_value in base_values:
                wf_value = base_value + self.compute_transition_cost(X, Y)
                if wf_value < min_wf_value:
                    min_wf_value = wf_value
                    min_p = X

            wf_new[Y] = min_wf_value

            # score of the state relative to the current recommendation
            # NOTE(review): the returned recommendation is the predecessor of the best-scoring Y,
            # not Y itself; the WFA description in the paper appears to select among states S
            # with S in p[S] -- TODO confirm this is the intended rule.
            score = min_wf_value + self.compute_transition_cost(Y, S_current)
            if score < best_score:
                best_score = score
                best_state = min_p

        if verbose:
            print(f"Best state: {best_state}, Best score: {best_score}")

        return wf_new, best_state

    # compute index benefit graph for the given query and candidate indexes
    def compute_IBG(self, query_object, candidate_indexes):
        """Build and return an `IBG` for `query_object` over `candidate_indexes`."""
        ibg = IBG(query_object, candidate_indexes)
        return ibg
    

    # extract candidate indexes from given query
    def extract_indexes(self, query_object, include_cols=False):
        """Return the candidate indexes `extract_query_indexes` derives from `query_object`.

        `self.max_key_columns` and `include_cols` are forwarded unchanged as the second
        and third positional arguments of `extract_query_indexes`.
        """
        candidates = extract_query_indexes(query_object, self.max_key_columns, include_cols)
        return candidates


    # generate stable partitions/sets of indexes for next query in workload
    def choose_candidates(self, n_pos, query_object, verbose):
        """Pick the indexes to monitor next and partition them for the upcoming workload.

        Steps: extract candidate indexes from the query and add unseen ones to `self.U`;
        build the IBG over all of `self.U`; update the benefit/interaction statistics;
        choose the best non-materialized candidates; combine them with the materialized
        set `self.M`; and cluster the result into partitions.

        Args:
            n_pos: position of the query in the workload sequence.
            query_object: the query to analyze.
            verbose: when truthy, progress information is printed.

        Returns:
            (new_partitions, need_to_repartition, ibg) where the first two values come
            from `choose_partition` and `ibg` is the graph built for this query.

        Raises:
            ValueError: if `self.U` grows beyond `self.max_U`.
        """
        # extract new candidate indexes from the query
        new_indexes = self.extract_indexes(query_object)
        # add new indexes to the list of all candidate indexes
        num_new = 0
        for index in new_indexes:
            if index.index_id not in self.U:
                self.U[index.index_id] = index
                num_new += 1

        if len(self.U) > self.max_U:
            raise ValueError("Number of candidate indexes exceeds the maximum limit. Aborting WFIT...")

        if verbose:
            print(f"Extracted {num_new} new indexes from query.")
            print(f"Candidate indexes (including those currently materialized), |U| = {len(self.U)}")
            print(f"{[index.index_id for index in self.U.values()]}")

        # TODO: need mechanism to evict indexes from U that may have gone "stale" to prevent unbounded growth of U

        # compute index benefit graph for the query
        if verbose: print(f"Computing IBG...")
        ibg = self.compute_IBG(query_object, list(self.U.values()))

        # update statistics for the candidate indexes (n_pos is the position of the query in the workload sequence)
        if verbose: print(f"Updating statistics...")
        self.update_stats(n_pos, ibg, verbose=False)

        # non-materialized candidate indexes
        X = [index for index_id, index in self.U.items() if index_id not in self.M]
        # Remaining monitoring budget. Clamp at zero: if more indexes are materialized than
        # idxCnt allows, a negative value would be used as a slice bound downstream and
        # select "all but the last |n|" candidates instead of none.
        num_indexes = max(0, self.idxCnt - len(self.M))

        # determine new set of candidate indexes to monitor for upcoming workload queries
        if verbose: print(f"Choosing top {num_indexes} indexes from {len(X)} non-materialized candidate indexes")
        top_indexes = self.top_indexes(n_pos, X, num_indexes, verbose)
        # materialized indexes are always monitored (both operands are dicts keyed by index_id)
        D = self.M | top_indexes
        if verbose: print(f"New set of indexes to monitor for upcoming workload, |D| = {len(D)}")

        # generate new partitions by clustering the new candidate set
        if verbose: print(f"Choosing new partitions...")
        new_partitions, need_to_repartition = self.choose_partition(n_pos, D, verbose)
        if verbose:
            print(f"Old partitions:")
            for P in self.stable_partitions:
                print(f"\t{[index.index_id for index in P]}")
            print("New partitions:")
            for P in new_partitions:
                print(f"\t{[index.index_id for index in P]}")

        return new_partitions, need_to_repartition, ibg
    

    # partition the new candidate set into clusters 
    # (need to optimize this function, currently it is a naive implementation)
    def choose_partition(self, N_workload, D, verbose):
        """Partition the monitored index set `D` into clusters of interacting indexes.

        The loss of a partitioning is the summed degree of interaction (doi) between
        indexes placed in different partitions. The baseline is the current stable
        partitioning restricted to `D`, plus one singleton per index that is not yet in
        `self.C`. `self.rand_cnt` rounds of randomized agglomerative clustering then try
        to find a partitioning with strictly lower loss whose total number of
        configurations (sum of 2**len(partition)) stays within `self.stateCnt`.

        Args:
            N_workload: position of the current query in the workload.
            D: dict mapping index_id -> index object for the indexes to monitor.
            verbose: accepted for interface compatibility; nothing is printed here.

        Returns:
            (bestSolution, need_to_repartition): the chosen list of partitions (each a
            list of index objects) and a flag that is True when `D` contains an index
            that is not in `self.C`. `bestSolution` is None only when the baseline is
            infeasible and `self.rand_cnt` is 0.
        """

        def compute_loss(partitions, doi):
            """Sum of doi over all pairs of indexes that sit in different partitions."""
            loss = 0
            for i in range(len(partitions)):
                for j in range(i + 1, len(partitions)):
                    for a in partitions[i]:
                        for b in partitions[j]:
                            loss += doi[(a.index_id, b.index_id)]
            return loss

        # current doi for every pair with recorded interaction statistics
        current_doi = defaultdict(int)
        for (a_idx, b_idx), history in self.intStats.items():
            # Optimistic estimate: the maximum, over every recorded position n, of the
            # average doi in the window [n, N_workload]. The history is appended in
            # chronological order, so walk it newest-first; that way the running total
            # only contains observations that actually fall inside the window.
            best_avg = 0
            doi_total = 0
            for (n, doi) in reversed(history):
                doi_total += doi
                best_avg = max(best_avg, doi_total / (N_workload - n + 1))
            # store symmetrically so lookups work for either ordering of the pair
            current_doi[(a_idx, b_idx)] = best_avg
            current_doi[(b_idx, a_idx)] = best_avg

        # baseline: current stable partitions restricted to D; partitions that lose all
        # of their indexes are dropped instead of being kept as empty lists
        P = []
        for partition in self.stable_partitions:
            kept = [index for index in partition if index.index_id in D]
            if kept:
                P.append(kept)

        # add a singleton partition for each new index in D that is not in C
        # NOTE(review): the flag only reflects newly added indexes, not indexes dropped
        # from D or a different clustering being chosen -- confirm the caller handles that.
        need_to_repartition = False
        for index_id, index in D.items():
            if index_id not in self.C:
                P.append([index])
                need_to_repartition = True

        # use the baseline as the initial solution if it fits the state budget
        total_configurations = sum(2**len(partition) for partition in P)
        if total_configurations <= self.stateCnt:
            bestSolution = P
            bestLoss = compute_loss(P, current_doi)
        else:
            bestSolution = None
            bestLoss = float('inf')

        # randomized clustering to look for a better solution
        for _ in range(self.rand_cnt):
            # start from singletons
            P = [[index] for index in D.values()]
            partition2id = {tuple(partition): pid for pid, partition in enumerate(P)}
            loss_cache = {}

            # keep merging until no feasible merge with positive loss remains
            while True:
                E = []   # all feasible merge candidates
                E1 = []  # feasible merge candidates where both sides are singletons

                total_configurations = sum(2**len(partition) for partition in P)
                for first in range(len(P)):
                    for second in range(first + 1, len(P)):
                        Pi, Pj = P[first], P[second]
                        cache_key = (partition2id[tuple(Pi)], partition2id[tuple(Pj)])
                        if cache_key in loss_cache:
                            loss = loss_cache[cache_key]
                        else:
                            loss = compute_loss([Pi, Pj], current_doi)
                            loss_cache[cache_key] = loss

                        # a merge is feasible if the total number of configurations
                        # after merging stays within stateCnt
                        total_after_merge = total_configurations - 2**len(Pi) - 2**len(Pj) + 2**(len(Pi) + len(Pj))
                        if loss > 0 and total_after_merge <= self.stateCnt:
                            E.append((Pi, Pj, loss))
                            if len(Pi) == 1 and len(Pj) == 1:
                                E1.append((Pi, Pj, loss))

                if len(E) == 0:
                    break

                if len(E1) > 0:
                    # prefer merging singletons, sampled proportionally to their loss
                    pool = E1
                    weights = [loss for (_, _, loss) in E1]
                else:
                    # otherwise sample by loss normalized by the number of extra states
                    # the merge creates (non-zero here because no pair is singleton/singleton)
                    pool = E
                    weights = [loss / (2**(len(Pi) + len(Pj)) - 2**len(Pi) - 2**len(Pj)) for (Pi, Pj, loss) in E]

                Pi, Pj, _ = random.choices(pool, weights=weights, k=1)[0]
                Pij_merged = Pi + Pj
                P.remove(Pi)
                P.remove(Pj)
                P.append(Pij_merged)
                partition2id[tuple(Pij_merged)] = len(partition2id)

            # keep the new solution only if it is strictly better than the best so far
            loss = compute_loss(P, current_doi)
            if loss < bestLoss:
                bestSolution = P
                bestLoss = loss

        return bestSolution, need_to_repartition


    # update candidate index statistics
    def update_stats(self, n, ibg, verbose):
        """Record the benefit and interaction observations of one query.

        For every index in `self.U` the value of `ibg.compute_max_benefit(index)` is
        appended to `self.idxStats[index_id]` as `(n, benefit)`. For every pair in
        `ibg.doi` with a positive value, `(n, doi)` is appended to `self.intStats[pair]`.
        Both histories are truncated to their last `self.histSize` entries.

        Args:
            n: position of the query in the workload.
            ibg: object providing `compute_max_benefit(index)` and a `doi` mapping.
            verbose: when truthy, the resulting statistics are printed.
        """
        # index benefit statistics
        if verbose: print("Updating index benefit statistics...")
        for index in self.U.values():
            benefit = ibg.compute_max_benefit(index)
            key = index.index_id
            benefit_history = self.idxStats[key]
            benefit_history.append((n, benefit))
            # keep only the most recent histSize observations
            self.idxStats[key] = benefit_history[-self.histSize:]

        if verbose:
            print("Index benefit statistics:")
            for index_id, stats in self.idxStats.items():
                print(f"\tIndex {index_id}: {stats}")

        # index interaction statistics
        if verbose: print("Updating index interaction statistics...")
        for pair, d in ibg.doi.items():
            doi_history = self.intStats[pair]
            # only positive interactions are recorded
            if d > 0:
                doi_history.append((n, d))
            # keep only the most recent histSize observations
            self.intStats[pair] = doi_history[-self.histSize:]

        if verbose:
            print("Index interaction statistics:")
            for pair, stats in self.intStats.items():
                print(f"\tPair {pair}: {stats}")


    # choose top num_indexes indexes from X with highest potential benefit
    def top_indexes(self, N_workload, X, num_indexes, verbose, positive_scores_only=False):
        """Select the `num_indexes` highest-scoring indexes from `X`.

        The "current benefit" of an index is an optimistic estimate derived from its
        history in `self.idxStats`: the maximum, over every recorded position n, of the
        average benefit in the window [n, N_workload]. Indexes already in `self.C` are
        scored by that benefit; other indexes are additionally charged their creation cost.

        Args:
            N_workload: position of the current query in the workload.
            X: iterable of candidate index objects (each with an `index_id` present in `self.U`).
            num_indexes: maximum number of indexes to return; values below zero are treated as zero.
            verbose: when truthy, the candidates and the selection are printed.
            positive_scores_only: when True, indexes whose score is not strictly positive are skipped.

        Returns:
            dict mapping index_id -> index object, ordered from highest to lowest score.
        """
        if verbose:
            print(f"Non-materialized candidate indexes, X = {[index.index_id for index in X]}")

        score = {}
        for index in X:
            # The history is appended in chronological order, so walk it newest-first:
            # the running total then covers exactly the observations inside the window
            # [n, N_workload] that the denominator describes. An empty history yields 0.
            current_benefit = 0
            b_total = 0
            for (n, b) in reversed(self.idxStats[index.index_id]):
                b_total += b
                current_benefit = max(current_benefit, b_total / (N_workload - n + 1))

            if index.index_id in self.C:
                # already monitored: the score is just the current benefit
                score[index.index_id] = current_benefit
            else:
                # not monitored: charge the creation cost so such indexes are only picked
                # when their potential benefit is high, which helps keep C stable
                score[index.index_id] = current_benefit - self.get_index_creation_cost(index)

        if positive_scores_only:
            eligible = [index_id for index_id, s in score.items() if s > 0]
        else:
            eligible = list(score)

        # A negative bound would slice from the end ("all but the last k") instead of
        # selecting nothing, so clamp it.
        limit = max(num_indexes, 0)
        ranked = sorted(eligible, key=lambda index_id: score[index_id], reverse=True)[:limit]
        top_indexes = {index_id: self.U[index_id] for index_id in ranked}

        if verbose:
            print(f"{len(top_indexes)} top indexes: {[index.index_id for index in top_indexes.values()]}")

        return top_indexes


    # TODO: return index creation cost
    def get_index_creation_cost(self, index):
        """Return the cost of creating `index`.

        Placeholder: every index currently costs 0 (a constant such as 1e6 was
        considered), so `index` is not inspected.
        """
        creation_cost = 0  # 1e6
        return creation_cost


    # compute transition cost between two MTS states/configurations
    def compute_transition_cost(self, S_old, S_new):
        """Return the cost of moving from configuration `S_old` to `S_new`.

        Only indexes that appear in `S_new` but not in `S_old` contribute, each with
        its creation cost; indexes that are dropped contribute nothing.
        """
        to_create = set(S_new) - set(S_old)
        return sum(self.get_index_creation_cost(created) for created in to_create)

#### Test WFIT implementation on sample SSB workload

In [9]:
# Generate an SSB workload: one query per call to generate_query(i), for i = 1..9.
# NOTE(review): `qg` is not defined in this cell; it must already exist in the kernel
# (presumably a query generator from ssb_qgen_class -- confirm it is created in an earlier cell).
workload = [qg.generate_query(i) for i in range(1, 10)]


In [10]:
# Inspect the first workload query (template id, SQL text, payload and predicate columns).
print(workload[0])

template id: 1, query: 
                SELECT SUM(lo_extendedprice * lo_discount) AS revenue
                FROM lineorder, dwdate
                WHERE lo_orderdate = d_datekey
                AND d_year = 1993
                AND lo_discount BETWEEN 4 AND 6 
                AND lo_quantity < 35;
            , payload: {'lineorder': ['lo_extendedprice', 'lo_discount']}, predicates: {'lineorder': ['lo_orderdate', 'lo_discount', 'lo_quantity'], 'dwdate': ['d_datekey', 'd_year']}, order by: {}, group by: {}


In [17]:
# instantiate WFIT
# Candidate indexes are extracted from a freshly generated query (generate_query(8));
# the first candidate alone is used as S_0, the initially materialized configuration.
C = extract_query_indexes(qg.generate_query(8), include_cols=True)  
S_0 = C[0:1]
# alternative parameterization kept for reference:
#wfit = WFIT(S_0, idxCnt=20, stateCnt=1000, histSize=100, rand_cnt=10)
wfit = WFIT(S_0, max_key_columns=3, stateCnt=200, rand_cnt=50)

# process the workload
# Queries are fed to WFIT one at a time, in workload order; verbose=True produces the log below.
for i, query in enumerate(workload):
    print(f"Processing query {i+1}")
    wfit.process_WFIT(query, verbose=True)
    print("\n\n")

##################################################################
Initializing WFA instances for 1 stable partitions...
Initial set of materialized indexes: ['IX_lineorder_lo_custkey']
Stable partitions: [['IX_lineorder_lo_custkey']]
Initial work function instances: 
	WFA Instance #0: {(): 0, (<pg_utils.Index object at 0x7ffaa94ada50>,): 0}

Maximum number of candidate indexes tracked: 50
Maximum number of MTS states/configurations: 200
Maximum number of historical index statistics kept: 100
Number of randomized clustering iterations: 50
##################################################################

Processing query 1
Generating new partitions for query #1
Extracted 19 new indexes from query.
Candidate indexes (including those currently materialized), |U| = 20
['IX_lineorder_lo_custkey', 'IX_lineorder_lo_orderdate', 'IX_lineorder_lo_discount', 'IX_lineorder_lo_quantity', 'IX_lineorder_lo_orderdate_lo_discount', 'IX_lineorder_lo_orderdate_lo_quantity', 'IX_lineorder_lo_discount_lo

Sampling configurations: 100%|██████████| 100/100 [00:00<00:00, 1035.29it/s]

Time spent on computing all pair degree of interaction: 0.09826850891113281
Updating statistics...
Choosing top 49 indexes from 19 non-materialized candidate indexes
Non-materialized candidate indexes, X = ['IX_lineorder_lo_orderdate', 'IX_lineorder_lo_discount', 'IX_lineorder_lo_quantity', 'IX_lineorder_lo_orderdate_lo_discount', 'IX_lineorder_lo_orderdate_lo_quantity', 'IX_lineorder_lo_discount_lo_orderdate', 'IX_lineorder_lo_discount_lo_quantity', 'IX_lineorder_lo_quantity_lo_orderdate', 'IX_lineorder_lo_quantity_lo_discount', 'IX_lineorder_lo_orderdate_lo_discount_lo_quantity', 'IX_lineorder_lo_orderdate_lo_quantity_lo_discount', 'IX_lineorder_lo_discount_lo_orderdate_lo_quantity', 'IX_lineorder_lo_discount_lo_quantity_lo_orderdate', 'IX_lineorder_lo_quantity_lo_orderdate_lo_discount', 'IX_lineorder_lo_quantity_lo_discount_lo_orderdate', 'IX_dwdate_d_datekey', 'IX_dwdate_d_year', 'IX_dwdate_d_datekey_d_year', 'IX_dwdate_d_year_d_datekey']
19 top indexes: ['IX_dwdate_d_year_d_dateke




No index scans were explicitly noted in the query plan.
Number of nodes in IBG: 20, Total number of what-if calls: 20, Time spent on what-if calls: 0.3319723606109619
Computing all pair degree of interaction...


Sampling configurations: 100%|██████████| 100/100 [00:00<00:00, 420.29it/s]


Time spent on computing all pair degree of interaction: 0.2394089698791504
Updating statistics...
Choosing top 49 indexes from 21 non-materialized candidate indexes
Non-materialized candidate indexes, X = ['IX_dwdate_d_year', 'IX_dwdate_d_datekey_d_year', 'IX_lineorder_lo_orderdate', 'IX_lineorder_lo_discount', 'IX_lineorder_lo_quantity', 'IX_lineorder_lo_orderdate_lo_discount', 'IX_lineorder_lo_orderdate_lo_quantity', 'IX_lineorder_lo_discount_lo_orderdate', 'IX_lineorder_lo_discount_lo_quantity', 'IX_lineorder_lo_quantity_lo_orderdate', 'IX_lineorder_lo_quantity_lo_discount', 'IX_lineorder_lo_orderdate_lo_discount_lo_quantity', 'IX_lineorder_lo_orderdate_lo_quantity_lo_discount', 'IX_lineorder_lo_discount_lo_orderdate_lo_quantity', 'IX_lineorder_lo_discount_lo_quantity_lo_orderdate', 'IX_lineorder_lo_quantity_lo_orderdate_lo_discount', 'IX_lineorder_lo_quantity_lo_discount_lo_orderdate', 'IX_dwdate_d_datekey', 'IX_dwdate_d_yearmonthnum', 'IX_dwdate_d_datekey_d_yearmonthnum', 'IX_dwda

Sampling configurations: 100%|██████████| 100/100 [00:00<00:00, 204.74it/s]


Time spent on computing all pair degree of interaction: 0.48989152908325195
Updating statistics...
Choosing top 48 indexes from 23 non-materialized candidate indexes
Non-materialized candidate indexes, X = ['IX_dwdate_d_year', 'IX_dwdate_d_datekey_d_year', 'IX_dwdate_d_year_d_datekey', 'IX_lineorder_lo_orderdate_lo_discount', 'IX_lineorder_lo_orderdate_lo_quantity', 'IX_lineorder_lo_orderdate_lo_quantity_lo_discount', 'IX_dwdate_d_yearmonthnum', 'IX_dwdate_d_datekey_d_yearmonthnum', 'IX_lineorder_lo_orderdate', 'IX_lineorder_lo_discount', 'IX_lineorder_lo_quantity', 'IX_lineorder_lo_discount_lo_orderdate', 'IX_lineorder_lo_discount_lo_quantity', 'IX_lineorder_lo_quantity_lo_orderdate', 'IX_lineorder_lo_quantity_lo_discount', 'IX_lineorder_lo_discount_lo_orderdate_lo_quantity', 'IX_lineorder_lo_discount_lo_quantity_lo_orderdate', 'IX_lineorder_lo_quantity_lo_orderdate_lo_discount', 'IX_lineorder_lo_quantity_lo_discount_lo_orderdate', 'IX_dwdate_d_datekey', 'IX_dwdate_d_weeknuminyear', '

Sampling configurations: 100%|██████████| 100/100 [00:01<00:00, 51.70it/s]


Time spent on computing all pair degree of interaction: 1.9358580112457275
Updating statistics...
Choosing top 46 indexes from 44 non-materialized candidate indexes
Non-materialized candidate indexes, X = ['IX_dwdate_d_datekey_d_year', 'IX_lineorder_lo_orderdate_lo_discount', 'IX_lineorder_lo_orderdate_lo_quantity', 'IX_lineorder_lo_orderdate_lo_quantity_lo_discount', 'IX_dwdate_d_yearmonthnum', 'IX_dwdate_d_datekey_d_yearmonthnum', 'IX_dwdate_d_yearmonthnum_d_datekey', 'IX_lineorder_lo_orderdate', 'IX_dwdate_d_datekey', 'IX_dwdate_d_weeknuminyear', 'IX_dwdate_d_datekey_d_weeknuminyear', 'IX_lineorder_lo_partkey', 'IX_lineorder_lo_suppkey', 'IX_lineorder_lo_orderdate_lo_partkey', 'IX_lineorder_lo_orderdate_lo_suppkey', 'IX_lineorder_lo_partkey_lo_orderdate', 'IX_lineorder_lo_partkey_lo_suppkey', 'IX_lineorder_lo_suppkey_lo_orderdate', 'IX_lineorder_lo_suppkey_lo_partkey', 'IX_lineorder_lo_orderdate_lo_partkey_lo_suppkey', 'IX_lineorder_lo_orderdate_lo_suppkey_lo_partkey', 'IX_lineorder

Sampling configurations: 100%|██████████| 100/100 [00:01<00:00, 73.08it/s]


Time spent on computing all pair degree of interaction: 1.3698139190673828
Updating statistics...
Choosing top 48 indexes from 46 non-materialized candidate indexes
Non-materialized candidate indexes, X = ['IX_dwdate_d_year', 'IX_dwdate_d_datekey_d_year', 'IX_dwdate_d_year_d_datekey', 'IX_lineorder_lo_orderdate_lo_discount', 'IX_lineorder_lo_orderdate_lo_quantity', 'IX_lineorder_lo_orderdate_lo_discount_lo_quantity', 'IX_lineorder_lo_orderdate_lo_quantity_lo_discount', 'IX_dwdate_d_yearmonthnum', 'IX_dwdate_d_datekey_d_yearmonthnum', 'IX_dwdate_d_yearmonthnum_d_datekey', 'IX_lineorder_lo_orderdate', 'IX_dwdate_d_datekey', 'IX_dwdate_d_weeknuminyear', 'IX_dwdate_d_datekey_d_weeknuminyear', 'IX_dwdate_d_weeknuminyear_d_datekey', 'IX_part_p_category', 'IX_part_p_category_p_partkey', 'IX_part_p_category_p_brand', 'IX_part_p_category_p_brand_p_partkey', 'IX_supplier_s_region', 'IX_lineorder_lo_partkey', 'IX_lineorder_lo_suppkey', 'IX_lineorder_lo_orderdate_lo_partkey', 'IX_lineorder_lo_orde

Sampling configurations: 100%|██████████| 100/100 [00:01<00:00, 54.50it/s]


Time spent on computing all pair degree of interaction: 1.8365278244018555
Updating statistics...
Choosing top 48 indexes from 46 non-materialized candidate indexes
Non-materialized candidate indexes, X = ['IX_dwdate_d_year', 'IX_dwdate_d_datekey_d_year', 'IX_dwdate_d_year_d_datekey', 'IX_lineorder_lo_orderdate_lo_discount', 'IX_lineorder_lo_orderdate_lo_quantity', 'IX_lineorder_lo_orderdate_lo_discount_lo_quantity', 'IX_lineorder_lo_orderdate_lo_quantity_lo_discount', 'IX_dwdate_d_yearmonthnum', 'IX_dwdate_d_datekey_d_yearmonthnum', 'IX_dwdate_d_yearmonthnum_d_datekey', 'IX_lineorder_lo_orderdate', 'IX_dwdate_d_datekey', 'IX_dwdate_d_weeknuminyear', 'IX_dwdate_d_datekey_d_weeknuminyear', 'IX_dwdate_d_weeknuminyear_d_datekey', 'IX_part_p_category', 'IX_part_p_category_p_partkey', 'IX_part_p_category_p_brand', 'IX_part_p_category_p_partkey_p_brand', 'IX_part_p_category_p_brand_p_partkey', 'IX_supplier_s_region', 'IX_lineorder_lo_partkey', 'IX_lineorder_lo_suppkey', 'IX_lineorder_lo_orde

Sampling configurations: 100%|██████████| 100/100 [00:07<00:00, 12.91it/s]


Time spent on computing all pair degree of interaction: 7.748157501220703
Updating statistics...
Choosing top 39 indexes from 63 non-materialized candidate indexes
Non-materialized candidate indexes, X = ['IX_dwdate_d_year', 'IX_dwdate_d_datekey_d_year', 'IX_dwdate_d_year_d_datekey', 'IX_lineorder_lo_orderdate_lo_discount', 'IX_lineorder_lo_orderdate_lo_quantity', 'IX_lineorder_lo_orderdate_lo_discount_lo_quantity', 'IX_lineorder_lo_orderdate_lo_quantity_lo_discount', 'IX_dwdate_d_yearmonthnum', 'IX_dwdate_d_datekey_d_yearmonthnum', 'IX_dwdate_d_yearmonthnum_d_datekey', 'IX_lineorder_lo_orderdate', 'IX_dwdate_d_datekey', 'IX_dwdate_d_weeknuminyear', 'IX_dwdate_d_datekey_d_weeknuminyear', 'IX_dwdate_d_weeknuminyear_d_datekey', 'IX_part_p_category', 'IX_part_p_category_p_partkey', 'IX_part_p_category_p_brand', 'IX_part_p_category_p_partkey_p_brand', 'IX_part_p_category_p_brand_p_partkey', 'IX_supplier_s_region', 'IX_lineorder_lo_custkey', 'IX_lineorder_lo_suppkey', 'IX_lineorder_lo_custk

Sampling configurations: 100%|██████████| 100/100 [00:18<00:00,  5.55it/s]


Time spent on computing all pair degree of interaction: 18.025380611419678
Updating statistics...
Choosing top 45 indexes from 76 non-materialized candidate indexes
Non-materialized candidate indexes, X = ['IX_dwdate_d_datekey_d_year', 'IX_lineorder_lo_orderdate_lo_discount', 'IX_lineorder_lo_orderdate_lo_quantity', 'IX_lineorder_lo_orderdate_lo_discount_lo_quantity', 'IX_lineorder_lo_orderdate_lo_quantity_lo_discount', 'IX_dwdate_d_yearmonthnum', 'IX_dwdate_d_datekey_d_yearmonthnum', 'IX_dwdate_d_yearmonthnum_d_datekey', 'IX_lineorder_lo_orderdate', 'IX_dwdate_d_datekey', 'IX_dwdate_d_weeknuminyear', 'IX_dwdate_d_datekey_d_weeknuminyear', 'IX_dwdate_d_weeknuminyear_d_datekey', 'IX_part_p_category', 'IX_part_p_category_p_partkey', 'IX_part_p_category_p_brand', 'IX_part_p_category_p_partkey_p_brand', 'IX_part_p_category_p_brand_p_partkey', 'IX_supplier_s_region_s_suppkey', 'IX_part_p_brand_p_partkey', 'IX_lineorder_lo_partkey', 'IX_lineorder_lo_partkey_lo_orderdate', 'IX_lineorder_lo_pa

Sampling configurations: 100%|██████████| 100/100 [00:20<00:00,  4.96it/s]


Time spent on computing all pair degree of interaction: 20.177494764328003
Updating statistics...
Choosing top 38 indexes from 61 non-materialized candidate indexes
Non-materialized candidate indexes, X = ['IX_dwdate_d_year', 'IX_lineorder_lo_orderdate_lo_discount', 'IX_lineorder_lo_orderdate_lo_quantity', 'IX_lineorder_lo_orderdate_lo_discount_lo_quantity', 'IX_lineorder_lo_orderdate_lo_quantity_lo_discount', 'IX_dwdate_d_yearmonthnum', 'IX_dwdate_d_datekey_d_yearmonthnum', 'IX_dwdate_d_yearmonthnum_d_datekey', 'IX_lineorder_lo_orderdate', 'IX_dwdate_d_datekey', 'IX_dwdate_d_weeknuminyear', 'IX_dwdate_d_datekey_d_weeknuminyear', 'IX_dwdate_d_weeknuminyear_d_datekey', 'IX_part_p_category', 'IX_part_p_category_p_partkey', 'IX_part_p_category_p_brand', 'IX_part_p_category_p_partkey_p_brand', 'IX_part_p_category_p_brand_p_partkey', 'IX_supplier_s_region', 'IX_supplier_s_region_s_suppkey', 'IX_part_p_brand_p_partkey', 'IX_lineorder_lo_partkey', 'IX_lineorder_lo_partkey_lo_orderdate', 'IX_l